diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go
index 829dcff6b..0c7fe32bd 100644
--- a/controllers/nvidiadriver_controller.go
+++ b/controllers/nvidiadriver_controller.go
@@ -20,12 +20,14 @@ import (
 	"context"
 	"fmt"
 	"maps"
+	"os"
 	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/workqueue"
@@ -57,6 +59,7 @@ type NVIDIADriverReconciler struct {
 	stateManager          state.Manager
 	nodeSelectorValidator validator.Validator
 	conditionUpdater      conditions.Updater
+	operatorNamespace     string
 }
 
 //+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers,verbs=get;list;watch;create;update;patch;delete
@@ -124,6 +127,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	}
 	clusterPolicyInstance := clusterPolicyList.Items[0]
 
+	r.operatorNamespace = os.Getenv("OPERATOR_NAMESPACE")
+
 	// Create a new InfoCatalog which is a generic interface for passing information to state managers
 	infoCatalog := state.NewInfoCatalog()
 
@@ -168,6 +173,16 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		return reconcile.Result{}, nil
 	}
 
+	clusterpolicyDriverLabels, err := getClusterpolicyDriverLabels(r.ClusterInfo, clusterPolicyInstance)
+	if err != nil {
+		return reconcile.Result{}, fmt.Errorf("failed to get clusterpolicy driver labels: %w", err)
+	}
+
+	err = updateNodesManagedByDriver(ctx, r, instance, clusterpolicyDriverLabels)
+	if err != nil {
+		return reconcile.Result{}, fmt.Errorf("failed to update nodes managed by driver: %w", err)
+	}
+
 	// Sync state and update status
 	managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
 
@@ -191,6 +206,9 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		}
 	}
 	// if no errors are reported from any state, then we would be waiting on driver daemonset pods
+	// TODO: Avoid marking 'default' NVIDIADriver instances as NotReady if DesiredNumberScheduled == 0.
+	// This will avoid unnecessary reconciliations when the 'default' instance is overridden on all
+	// GPU nodes, in which case DesiredNumberScheduled being 0 is valid.
 	if errorInfo == nil {
 		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready")
 		if condErr != nil {
@@ -404,5 +422,154 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
 		return fmt.Errorf("failed to add index key: %w", err)
 	}
 
+	// Index pods by the name of the node they are scheduled on, so driver pods
+	// can be listed per node in updateNodesManagedByDriver.
+	if err := mgr.GetFieldIndexer().IndexField(ctx, &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string {
+		pod := rawObj.(*corev1.Pod)
+		return []string{pod.Spec.NodeName}
+	}); err != nil {
+		return fmt.Errorf("failed to add index key: %w", err)
+	}
+
 	return nil
 }
+
+func updateNodesManagedByDriver(ctx context.Context, r *NVIDIADriverReconciler, instance *nvidiav1alpha1.NVIDIADriver, clusterpolicyDriverLabels map[string]string) error {
+	nodes, err := getNVIDIADriverSelectedNodes(ctx, r.Client, instance)
+	if err != nil {
+		return fmt.Errorf("failed to get selected nodes for NVIDIADriver CR: %w", err)
+	}
+
+	// A map tracking which node objects need to be updated, e.g. because new
+	// labels / annotations need to be applied.
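+	// Keys are pointers into nodes.Items, so a node that is touched by more than
+	// one branch below is still only updated once in the final Update loop.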
+	nodesToUpdate := map[*corev1.Node]struct{}{}
+
+	// Iterate by index so that &nodes.Items[i] is a distinct pointer per node;
+	// taking the address of the range variable would alias a single variable.
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		labels := node.GetLabels()
+		annotations := node.GetAnnotations()
+
+		managedBy, exists := labels["nvidia.com/gpu.driver.managed-by"]
+		if !exists {
+			// If the 'managed-by' label does not exist yet, label the node with
+			// this instance's name.
+			labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
+			node.SetLabels(labels)
+			nodesToUpdate[node] = struct{}{}
+			// If there is an orphan driver pod running on the node, indicate to the
+			// upgrade controller that an upgrade is required. This occurs when we
+			// are migrating from a ClusterPolicy-owned daemonset to an
+			// NVIDIADriver-owned daemonset.
+			podList := &corev1.PodList{}
+			err = r.Client.List(ctx, podList,
+				client.InNamespace(r.operatorNamespace),
+				client.MatchingLabels(clusterpolicyDriverLabels),
+				client.MatchingFields{"spec.nodeName": node.Name})
+			if err != nil {
+				return fmt.Errorf("failed to list driver pods: %w", err)
+			}
+			if len(podList.Items) == 0 {
+				continue
+			}
+			pod := podList.Items[0]
+			if len(pod.OwnerReferences) == 0 {
+				annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
+				node.SetAnnotations(annotations)
+			}
+			continue
+		}
+
+		// Do nothing if the node is already managed by this CR.
+		if managedBy == instance.Name {
+			continue
+		}
+
+		// If the node is 'managed-by' another CR, there are two scenarios:
+		// 1) There is no driver pod running on the node, so the label is stale.
+		//    Take ownership of the node.
+		// 2) There is a driver pod running on the node, and it is not an orphan.
+		//    There are two possible actions:
+		//    a) If the NVIDIADriver instance we are currently reconciling is the
+		//       'default' instance (its node selector is empty), do nothing. All
+		//       other NVIDIADriver instances take precedence.
+		//    b) Otherwise, the running pod no longer falls into the node pool of
+		//       the CR currently managing the node, so the NVIDIADriver instance
+		//       we are reconciling should take ownership of the node.
+		podList := &corev1.PodList{}
+		err = r.Client.List(ctx, podList,
+			client.InNamespace(r.operatorNamespace),
+			client.MatchingLabels(map[string]string{AppComponentLabelKey: AppComponentLabelValue}),
+			client.MatchingFields{"spec.nodeName": node.Name})
+		if err != nil {
+			return fmt.Errorf("failed to list driver pods: %w", err)
+		}
+		if len(podList.Items) == 0 {
+			labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
+			node.SetLabels(labels)
+			nodesToUpdate[node] = struct{}{}
+			continue
+		}
+		if len(instance.Spec.NodeSelector) == 0 {
+			// If the nodeSelector for this NVIDIADriver instance is empty, we treat
+			// it as the 'default' CR, which has the lowest precedence. Allow the
+			// existing driver pod, owned by another NVIDIADriver CR, to continue
+			// to run.
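+			// For example, a node that still matches another CR's non-empty
+			// selector keeps that CR's driver pod, even though the 'default'
+			// instance also selects the node.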
+			continue
+		}
+		pod := podList.Items[0]
+		if len(pod.OwnerReferences) > 0 {
+			// Orphan the pod by removing its 'app' label, detaching it from the
+			// daemonset owned by the other CR.
+			err := r.Client.Patch(ctx, &pod, client.RawPatch(types.MergePatchType, []byte(`{"metadata":{"labels":{"app": null}}}`)))
+			if err != nil {
+				return fmt.Errorf("failed to patch pod in order to make it an orphan: %w", err)
+			}
+		}
+		labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
+		annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
+		node.SetLabels(labels)
+		node.SetAnnotations(annotations)
+		nodesToUpdate[node] = struct{}{}
+	}
+
+	// Apply the updated labels / annotations to the node objects.
+	for node := range nodesToUpdate {
+		err = r.Client.Update(ctx, node)
+		if err != nil {
+			return fmt.Errorf("failed to update node %s: %w", node.Name, err)
+		}
+	}
+
+	return nil
+}
+
+// getNVIDIADriverSelectedNodes returns the nodes selected by the nodeSelector
+// labels configured for the given NVIDIADriver instance.
+func getNVIDIADriverSelectedNodes(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
+	nodeList := &corev1.NodeList{}
+
+	// Resolve the selector into a local variable rather than writing the default
+	// back into cr.Spec.NodeSelector, so this helper does not mutate the CR.
+	nodeSelector := cr.Spec.NodeSelector
+	if nodeSelector == nil {
+		nodeSelector = cr.GetNodeSelector()
+	}
+
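+	// Note: labels.Set of an empty map yields a selector with no requirements,
+	// so an instance configured with an explicitly empty nodeSelector matches
+	// every node in the cluster.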
+	selector := labels.Set(nodeSelector).AsSelector()
+
+	opts := []client.ListOption{
+		client.MatchingLabelsSelector{Selector: selector},
+	}
+	err := k8sClient.List(ctx, nodeList, opts...)
+
+	return nodeList, err
+}
+
+// getClusterpolicyDriverLabels returns a set of labels that can be used to
+// identify running driver pods owned by ClusterPolicy.
+func getClusterpolicyDriverLabels(clusterInfo clusterinfo.Interface, clusterpolicy gpuv1.ClusterPolicy) (map[string]string, error) {
+	// Initialize with the common app=nvidia-driver-daemonset label.
+	driverLabelKey := DriverLabelKey
+	driverLabelValue := DriverLabelValue
+
+	ocpVer, err := clusterInfo.GetOpenshiftVersion()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get the OpenShift version: %w", err)
+	}
+	if ocpVer != "" && clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit != nil && *clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit {
+		// For OCP, when DTK is enabled, the app=nvidia-driver-daemonset label is
+		// not constant and changes based on the RHCOS version. Hence, use the DTK
+		// identification label instead.
+		driverLabelKey = ocpDriverToolkitIdentificationLabel
+		driverLabelValue = ocpDriverToolkitIdentificationValue
+	}
+
+	return map[string]string{driverLabelKey: driverLabelValue}, nil
+}
diff --git a/internal/state/driver.go b/internal/state/driver.go
index 34e19ef71..284a6af6d 100644
--- a/internal/state/driver.go
+++ b/internal/state/driver.go
@@ -548,6 +548,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS
 	return &driverSpec{
 		Spec:      spec,
+		CRName:    cr.Name,
 		AppName:   nvidiaDriverAppName,
 		Name:      nvidiaDriverName,
 		ImagePath: imagePath,
diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go
index a591fa9d3..77992e137 100644
--- a/internal/state/driver_test.go
+++ b/internal/state/driver_test.go
@@ -624,6 +624,7 @@ func getMinimalDriverRenderData() *driverRenderData {
 			ReadinessProbe: getDefaultContainerProbeSpec(),
 			DriverType:     nvidiav1alpha1.GPU,
 		},
+		CRName:    "test-cr",
 		AppName:   "nvidia-gpu-driver-ubuntu22.04-7c6d7bd86b",
 		Name:      "nvidia-gpu-driver-ubuntu22.04",
 		ImagePath: "nvcr.io/nvidia/driver:525.85.03-ubuntu22.04",
diff --git a/internal/state/testdata/golden/driver-additional-configs.yaml b/internal/state/testdata/golden/driver-additional-configs.yaml
index 70fdbbc15..c4c978855 100644
--- a/internal/state/testdata/golden/driver-additional-configs.yaml
+++ b/internal/state/testdata/golden/driver-additional-configs.yaml
@@ -240,6 +240,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml
index 8f76608ed..b7df38f12 100644
--- a/internal/state/testdata/golden/driver-full-spec.yaml
+++ b/internal/state/testdata/golden/driver-full-spec.yaml
@@ -254,9 +254,8 @@ spec:
           mountPropagation: HostToContainer
           name: run-mellanox-drivers
       nodeSelector:
-        example.com/bar: bar
-        example.com/foo: foo
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: custom-priority-class-name
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml
index db43ce253..4dbd77a71 100644
--- a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml
+++ b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml
@@ -407,6 +407,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-openshift
       tolerations:
diff --git a/internal/state/testdata/golden/driver-gdrcopy.yaml b/internal/state/testdata/golden/driver-gdrcopy.yaml
index b8e6fe192..a0b762458 100644
--- a/internal/state/testdata/golden/driver-gdrcopy.yaml
+++ b/internal/state/testdata/golden/driver-gdrcopy.yaml
@@ -296,6 +296,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-gds.yaml b/internal/state/testdata/golden/driver-gds.yaml
index 3851f4e1e..44361f655 100644
--- a/internal/state/testdata/golden/driver-gds.yaml
+++ b/internal/state/testdata/golden/driver-gds.yaml
@@ -296,6 +296,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-minimal.yaml b/internal/state/testdata/golden/driver-minimal.yaml
index 6908224aa..7f5cec264 100644
--- a/internal/state/testdata/golden/driver-minimal.yaml
+++ b/internal/state/testdata/golden/driver-minimal.yaml
@@ -231,6 +231,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml
index a171e8da3..7fb80e1ac 100644
--- a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml
+++ b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml
@@ -341,6 +341,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-openshift
       tolerations:
diff --git a/internal/state/testdata/golden/driver-precompiled.yaml b/internal/state/testdata/golden/driver-precompiled.yaml
index d87a9980c..4a8ecd1c8 100644
--- a/internal/state/testdata/golden/driver-precompiled.yaml
+++ b/internal/state/testdata/golden/driver-precompiled.yaml
@@ -234,6 +234,7 @@ spec:
       nodeSelector:
         feature.node.kubernetes.io/kernel-version.full: 5.4.0-150-generic
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml
index c0f70d584..94d7aa5b7 100644
--- a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml
+++ b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml
@@ -307,6 +307,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-rdma.yaml b/internal/state/testdata/golden/driver-rdma.yaml
index d26cd0f23..d2cf9def0 100644
--- a/internal/state/testdata/golden/driver-rdma.yaml
+++ b/internal/state/testdata/golden/driver-rdma.yaml
@@ -301,6 +301,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml
index 5e329229b..4af641056 100644
--- a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml
+++ b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml
@@ -302,6 +302,7 @@ spec:
       nodeSelector:
        feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
         nvidia.com/gpu.deploy.vgpu-manager: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-vgpu-manager-openshift
       tolerations:
diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml
index d998d4420..73f1cf379 100644
--- a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml
+++ b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml
@@ -217,6 +217,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.vgpu-manager: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-vgpu-manager-ubuntu22.04
       tolerations:
diff --git a/internal/state/testdata/golden/driver-vgpu-licensing.yaml b/internal/state/testdata/golden/driver-vgpu-licensing.yaml
index b040bbe93..4c55cb93e 100644
--- a/internal/state/testdata/golden/driver-vgpu-licensing.yaml
+++ b/internal/state/testdata/golden/driver-vgpu-licensing.yaml
@@ -237,6 +237,7 @@ spec:
           name: run-mellanox-drivers
       nodeSelector:
         nvidia.com/gpu.deploy.driver: "true"
+        nvidia.com/gpu.driver.managed-by: test-cr
       priorityClassName: system-node-critical
       serviceAccountName: nvidia-gpu-driver-ubuntu22.04
       tolerations:
diff --git a/internal/state/types.go b/internal/state/types.go
index 000eb10f2..cd25b3abb 100644
--- a/internal/state/types.go
+++ b/internal/state/types.go
@@ -30,6 +30,7 @@ type SyncingSource source.SyncingSource
 // which is to be populated with the fully-qualified image path.
 type driverSpec struct {
 	Spec      *nvidiav1alpha1.NVIDIADriverSpec
+	CRName    string
 	AppName   string
 	Name      string
 	ImagePath string
diff --git a/internal/validator/validator.go b/internal/validator/validator.go
index e43c8127c..f15d18d81 100644
--- a/internal/validator/validator.go
+++ b/internal/validator/validator.go
@@ -51,8 +51,30 @@ func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alph
 		return err
 	}
 
+	crHasEmptyNodeSelector := len(cr.Spec.NodeSelector) == 0
+
 	names := map[string]struct{}{}
-	for di := range drivers.Items {
+	for di, driver := range drivers.Items {
+		if driver.Name == cr.Name {
+			continue
+		}
+
+		if crHasEmptyNodeSelector {
+			// If the CR we are validating has an empty node selector, it conflicts
+			// with another CR only if that CR is also configured with an empty node
+			// selector: only one NVIDIADriver instance can be configured with an
+			// empty node selector at any point in time. We deem such an instance
+			// the 'default' instance, as it will get deployed on all GPU nodes.
+			// Other CRs, with non-empty node selectors, can override the 'default'
+			// NVIDIADriver instance.
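+			// For example, a CR selecting nodes by a label such as
+			// nvidia.com/gpu.product takes ownership of the matching nodes, while
+			// the 'default' instance covers the remaining GPU nodes.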
+			if len(driver.Spec.NodeSelector) == 0 {
+				return fmt.Errorf("cannot have an empty nodeSelector: NVIDIADriver instance %s is already configured with an empty nodeSelector", driver.Name)
+			}
+			continue
+		}
+
 		nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, &drivers.Items[di])
 		if err != nil {
 			return err
diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go
index 8171f6cf5..0cd7281e9 100644
--- a/internal/validator/validator_test.go
+++ b/internal/validator/validator_test.go
@@ -93,6 +93,7 @@ func makeTestNode(opts ...nodeOptions) *corev1.Node {
 	return n
 }
 
+// TODO: update this test function
 func TestCheckNodeSelector(t *testing.T) {
 	node := makeTestNode(labelled(map[string]string{"os-version": "ubuntu20.04"}))
 	driver := makeTestDriver(nodeSelector(node.Labels))
diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml
index 4f21fe1c1..b508ba8bc 100644
--- a/manifests/state-driver/0500_daemonset.yaml
+++ b/manifests/state-driver/0500_daemonset.yaml
@@ -60,14 +60,12 @@ spec:
       {{- end }}
     spec:
       nodeSelector:
+        nvidia.com/gpu.driver.managed-by: {{ .Driver.CRName | quote }}
        {{- if eq .Driver.Spec.DriverType "vgpu-host-manager" }}
         nvidia.com/gpu.deploy.vgpu-manager: "true"
         {{- else }}
         nvidia.com/gpu.deploy.driver: "true"
         {{- end }}
-        {{- if .Driver.Spec.NodeSelector }}
-        {{- .Driver.Spec.NodeSelector | yaml | nindent 8 }}
-        {{- end }}
         {{- if and (.Openshift) (.Runtime.OpenshiftDriverToolkitEnabled) }}
         feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: {{ .Openshift.RHCOSVersion | quote }}
         {{- end }}
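+        # Per-CR scheduling is now driven by the 'managed-by' node label that the
+        # controller maintains; the CR's own nodeSelector is no longer rendered
+        # into the pod spec here.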