Skip to content

Commit

Permalink
Migrate from ClusterPolicy to NVIDIADriver owned driver daemonsets
Browse files Browse the repository at this point in the history
Signed-off-by: Christopher Desiniotis <[email protected]>
  • Loading branch information
cdesiniotis committed Jun 5, 2024
1 parent 58afc29 commit cd34957
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
4 changes: 2 additions & 2 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -3659,7 +3659,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
return true, nil
}

func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, deleteOptions *client.DeleteOptions) error {
// Get all DaemonSets owned by ClusterPolicy
//
// (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
Expand All @@ -3676,7 +3676,7 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context)
// filter out DaemonSets which are not the NVIDIA driver/vgpu-manager
if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) {
n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
err = n.client.Delete(ctx, &ds)
err = n.client.Delete(ctx, &ds, deleteOptions)
if err != nil {
return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err)
}
Expand Down
8 changes: 6 additions & 2 deletions controllers/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -952,8 +952,12 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
n.singleton.Spec.Driver.UseNvdiaDriverCRDType() {
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
n.idx++
// Cleanup all driver daemonsets owned by ClusterPolicy.
err := n.cleanupAllDriverDaemonSets(n.ctx)
// Clean up all driver daemonsets owned by ClusterPolicy, but orphan the dependent pod objects.
// This way, switching to the new NVIDIADriver API does not cause a cluster-wide disruption.
// NVIDIA driver pods owned by ClusterPolicy daemonsets will remain running until the NVIDIADriver
// controller migrates these pods to new ones owned by NVIDIADriver daemonsets.
deletePropagationOrphan := metav1.DeletePropagationOrphan
err := n.cleanupAllDriverDaemonSets(n.ctx, &client.DeleteOptions{PropagationPolicy: &deletePropagationOrphan})
if err != nil {
return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
}
Expand Down

0 comments on commit cd34957

Please sign in to comment.