diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 07114c39f..6906bf469 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -3659,7 +3659,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) { return true, nil } -func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error { +func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, deleteOptions *client.DeleteOptions) error { // Get all DaemonSets owned by ClusterPolicy // // (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector @@ -3676,7 +3676,7 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) // filter out DaemonSets which are not the NVIDIA driver/vgpu-manager if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) { n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name) - err = n.client.Delete(ctx, &ds) + err = n.client.Delete(ctx, &ds, deleteOptions) if err != nil { return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err) } diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 9c1028ebc..7f086386d 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -952,8 +952,12 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) { n.singleton.Spec.Driver.UseNvdiaDriverCRDType() { n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy") n.idx++ - // Cleanup all driver daemonsets owned by ClusterPolicy. - err := n.cleanupAllDriverDaemonSets(n.ctx) + // Cleanup all driver daemonsets owned by ClusterPolicy, but orphan the dependent pod objects. + // This way, switching to the new NVIDIADriver API does not cause a cluster-wide disruption. + // NVIDIA driver pods owned by ClusterPolicy daemonsets will remain running until the NVIDIADriver + // controller migrates these pods to new ones owned by NVIDIADriver daemonsets. + deletePropagationOrphan := metav1.DeletePropagationOrphan + err := n.cleanupAllDriverDaemonSets(n.ctx, &client.DeleteOptions{PropagationPolicy: &deletePropagationOrphan}) if err != nil { return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err) }