From a8fe43342956f7ead8c723d9cfc1dd677d5b6efc Mon Sep 17 00:00:00 2001 From: Arjun Date: Mon, 1 Jul 2024 22:41:02 +0000 Subject: [PATCH 1/3] Persistence Driver Changes Signed-off-by: Arjun --- api/nvidia/v1alpha1/nvidiadriver_types.go | 6 ++++++ api/nvidia/v1alpha1/zz_generated.deepcopy.go | 5 +++++ config/crd/bases/nvidia.com_nvidiadrivers.yaml | 7 +++++++ .../gpu-operator/crds/nvidia.com_nvidiadrivers.yaml | 7 +++++++ 4 files changed, 25 insertions(+) diff --git a/api/nvidia/v1alpha1/nvidiadriver_types.go b/api/nvidia/v1alpha1/nvidiadriver_types.go index 86bae0b48..d5e1cd206 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types.go @@ -59,6 +59,12 @@ type NVIDIADriverSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"` + // PersistDriver indicates if the driver install should be persisted across restarts + PersistDriver *bool `json:"persist,omitempty"` + + // InstallDirectory is the install location for the driver + InstallDirectory string `json:"installDirectory,omitempty"` + // NVIDIA Driver container startup probe settings StartupProbe *ContainerProbeSpec `json:"startupProbe,omitempty"` diff --git a/api/nvidia/v1alpha1/zz_generated.deepcopy.go b/api/nvidia/v1alpha1/zz_generated.deepcopy.go index a7b23f115..8ba5bee65 100644 --- a/api/nvidia/v1alpha1/zz_generated.deepcopy.go +++ b/api/nvidia/v1alpha1/zz_generated.deepcopy.go @@ -313,6 +313,11 @@ func (in *NVIDIADriverSpec) DeepCopyInto(out *NVIDIADriverSpec) { *out = new(bool) **out = **in } + if in.PersistDriver != nil { + in, out := &in.PersistDriver, &out.PersistDriver + *out = new(bool) + **out = **in + } if in.StartupProbe != nil { in, out := &in.StartupProbe, &out.StartupProbe *out = new(ContainerProbeSpec) diff --git a/config/crd/bases/nvidia.com_nvidiadrivers.yaml 
b/config/crd/bases/nvidia.com_nvidiadrivers.yaml index c040201f8..f6765f89f 100644 --- a/config/crd/bases/nvidia.com_nvidiadrivers.yaml +++ b/config/crd/bases/nvidia.com_nvidiadrivers.yaml @@ -204,6 +204,9 @@ spec: items: type: string type: array + installDirectory: + description: InstallDirectory is the install location for the driver + type: string kernelModuleConfig: description: 'Optional: Kernel module configuration parameters for the NVIDIA Driver' @@ -511,6 +514,10 @@ spec: description: NodeSelector specifies a selector for installation of NVIDIA driver type: object + persist: + description: PersistDriver indicates if the driver install should + be persisted across restarts + type: boolean priorityClassName: description: 'Optional: Set priorityClassName' type: string diff --git a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml index c040201f8..f6765f89f 100644 --- a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -204,6 +204,9 @@ spec: items: type: string type: array + installDirectory: + description: InstallDirectory is the install location for the driver + type: string kernelModuleConfig: description: 'Optional: Kernel module configuration parameters for the NVIDIA Driver' @@ -511,6 +514,10 @@ spec: description: NodeSelector specifies a selector for installation of NVIDIA driver type: object + persist: + description: PersistDriver indicates if the driver install should + be persisted across restarts + type: boolean priorityClassName: description: 'Optional: Set priorityClassName' type: string From 7f829b40043bcaa8a778a044d857eadaca3e5235 Mon Sep 17 00:00:00 2001 From: Arjun Date: Wed, 3 Jul 2024 18:14:12 +0000 Subject: [PATCH 2/3] new persistence changes --- .../samples/nvidia_v1alpha1_nvidiadriver.yaml | 6 ++- manifest.diff | 51 +++++++++++++++++++ manifests/state-driver/0500_daemonset.yaml | 26 
++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 manifest.diff diff --git a/config/samples/nvidia_v1alpha1_nvidiadriver.yaml b/config/samples/nvidia_v1alpha1_nvidiadriver.yaml index 1a73fab95..073b93696 100644 --- a/config/samples/nvidia_v1alpha1_nvidiadriver.yaml +++ b/config/samples/nvidia_v1alpha1_nvidiadriver.yaml @@ -6,9 +6,9 @@ spec: # use pre-compiled packages for NVIDIA driver installation. usePrecompiled: false driverType: gpu - repository: nvcr.io/nvidia + repository: "1837582943" #nvcr.io/nvidia image: driver - version: "550.90.07" + version: "550.54.17" imagePullPolicy: IfNotPresent imagePullSecrets: [] nodeSelector: {} @@ -18,6 +18,8 @@ spec: useHostMofed: false gds: enabled: false + persist: true + installDirectory: '/opt/nvidia/driver' # Private mirror repository configuration repoConfig: name: "" diff --git a/manifest.diff b/manifest.diff new file mode 100644 index 000000000..c8282f63f --- /dev/null +++ b/manifest.diff @@ -0,0 +1,51 @@ +diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml +index 8ceb7820c..8716d147d 100644 +--- a/manifests/state-driver/0500_daemonset.yaml ++++ b/manifests/state-driver/0500_daemonset.yaml +@@ -205,6 +205,12 @@ spec: + # always use runc for driver containers + - name: NVIDIA_VISIBLE_DEVICES + value: void ++ {{- if .Driver.Spec.PersistDriver }} ++ - name: RESTARTS_ENABLED ++ value: "true" ++ - name: INSTALL_DIR ++ value: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} ++ {{- end }} + {{- if deref .Driver.Spec.UseOpenKernelModules }} + - name: OPEN_KERNEL_MODULES_ENABLED + value: "true" +@@ -254,6 +260,14 @@ spec: + {{- end }} + {{- end }} + volumeMounts: ++ {{- if .Driver.Spec.PersistDriver }} ++ - name: install-dir ++ mountPath: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} ++ - name: lib-modules ++ mountPath: /lib/modules ++ - name: dev ++ mountPath: /dev ++ {{- end }} + - name: run-nvidia + 
mountPath: /run/nvidia + mountPropagation: Bidirectional +@@ -574,6 +588,18 @@ spec: + readOnly: true + {{- end }} + volumes: ++ {{- if .Driver.Spec.PersistDriver }} ++ - name: install-dir ++ hostPath: ++ path: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} ++ type: DirectoryOrCreate ++ - name: lib-modules ++ hostPath: ++ path: /lib/modules ++ - name: dev ++ hostPath: ++ path: /dev ++ {{- end }} + - name: run-nvidia + hostPath: + path: /run/nvidia diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index 8ceb7820c..8716d147d 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -205,6 +205,12 @@ spec: # always use runc for driver containers - name: NVIDIA_VISIBLE_DEVICES value: void + {{- if deref .Driver.Spec.PersistDriver }} + - name: RESTARTS_ENABLED + value: "true" + - name: INSTALL_DIR + value: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} + {{- end }} {{- if deref .Driver.Spec.UseOpenKernelModules }} - name: OPEN_KERNEL_MODULES_ENABLED value: "true" @@ -254,6 +260,14 @@ spec: {{- end }} {{- end }} volumeMounts: + {{- if deref .Driver.Spec.PersistDriver }} + - name: install-dir + mountPath: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} + - name: lib-modules + mountPath: /lib/modules + - name: dev + mountPath: /dev + {{- end }} - name: run-nvidia mountPath: /run/nvidia mountPropagation: Bidirectional @@ -574,6 +588,18 @@ spec: readOnly: true {{- end }} volumes: + {{- if deref .Driver.Spec.PersistDriver }} + - name: install-dir + hostPath: + path: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} + type: DirectoryOrCreate + - name: lib-modules + hostPath: + path: /lib/modules + - name: dev + hostPath: + path: /dev + {{- end }} - name: run-nvidia hostPath: path: /run/nvidia From 651ed3ab39df2ee3bdeb23c56fe09f837156f226 Mon Sep 17 00:00:00 2001 From: Arjun Date: Wed, 3 Jul 2024 21:06:26 +0000 Subject: 
[PATCH 3/3] updated changes --- config/samples/nvidia_v1alpha1_nvidiadriver.yaml | 6 +++--- manifest.diff | 2 +- manifests/state-driver/0500_daemonset.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/config/samples/nvidia_v1alpha1_nvidiadriver.yaml b/config/samples/nvidia_v1alpha1_nvidiadriver.yaml index 073b93696..08ad58c66 100644 --- a/config/samples/nvidia_v1alpha1_nvidiadriver.yaml +++ b/config/samples/nvidia_v1alpha1_nvidiadriver.yaml @@ -6,9 +6,9 @@ spec: # use pre-compiled packages for NVIDIA driver installation. usePrecompiled: false driverType: gpu - repository: "1837582943" #nvcr.io/nvidia + repository: nvcr.io/nvidia image: driver - version: "550.54.17" + version: "550.90.07" imagePullPolicy: IfNotPresent imagePullSecrets: [] nodeSelector: {} @@ -18,7 +18,7 @@ spec: useHostMofed: false gds: enabled: false - persist: true + persist: false installDirectory: '/opt/nvidia/driver' # Private mirror repository configuration repoConfig: diff --git a/manifest.diff b/manifest.diff index c8282f63f..56c4beb73 100644 --- a/manifest.diff +++ b/manifest.diff @@ -7,7 +7,7 @@ index 8ceb7820c..8716d147d 100644 - name: NVIDIA_VISIBLE_DEVICES value: void + {{- if .Driver.Spec.PersistDriver }} -+ - name: RESTARTS_ENABLED ++ - name: PERSIST_DRIVER + value: "true" + - name: INSTALL_DIR + value: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }} diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index 8716d147d..4f54b6215 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -206,7 +206,7 @@ spec: - name: NVIDIA_VISIBLE_DEVICES value: void {{- if deref .Driver.Spec.PersistDriver }} - - name: RESTARTS_ENABLED + - name: PERSIST_DRIVER value: "true" - name: INSTALL_DIR value: {{ .Driver.Spec.InstallDirectory | default "/opt/nvidia/driver" }}