Skip to content

Commit

Permalink
Enable out-of-service taint in FAR
Browse files Browse the repository at this point in the history
  • Loading branch information
k-keiichi-rh committed Oct 12, 2023
1 parent b2d3419 commit c2e03f9
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 22 deletions.
10 changes: 10 additions & 0 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (
FARFinalizer string = "fence-agents-remediation.medik8s.io/far-finalizer"
// Taints
FARNoExecuteTaintKey = "medik8s.io/fence-agents-remediation"
OutOfServiceTaintKey = "node.kubernetes.io/out-of-service"
// FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not
FenceAgentActionSucceededType = "FenceAgentActionSucceeded"
// condition messages
Expand All @@ -52,10 +53,14 @@ const (
FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded"
// RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully"

ResourceDeletionRemediationStrategy = RemediationStrategyType("ResourceDeletion")
OutOfServiceTaintRemediationStrategy = RemediationStrategyType("OutOfServiceTaint")
)

type ParameterName string
type NodeName string
type RemediationStrategyType string

// FenceAgentsRemediationSpec defines the desired state of FenceAgentsRemediation
type FenceAgentsRemediationSpec struct {
Expand All @@ -70,6 +75,11 @@ type FenceAgentsRemediationSpec struct {
// NodeParameters are passed to the fencing agent according to the node that is fenced, since they are node specific
//+operator-sdk:csv:customresourcedefinitions:type=spec
NodeParameters map[ParameterName]map[NodeName]string `json:"nodeparameters,omitempty"`

//RemediationStrategy is the remediation method for unhealthy nodes.
// +kubebuilder:default:="ResourceDeletion"
// +kubebuilder:validation:Enum=ResourceDeletion;OutOfServiceTaint
RemediationStrategy RemediationStrategyType `json:"remediationStrategy,omitempty"`
}

// FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ spec:
description: NodeParameters are passed to the fencing agent according
to the node that is fenced, since they are node specific
type: object
remediationStrategy:
default: ResourceDeletion
description: RemediationStrategy is the remediation method for unhealthy
nodes.
enum:
- ResourceDeletion
- OutOfServiceTaint
type: string
sharedparameters:
additionalProperties:
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ spec:
according to the node that is fenced, since they are node
specific
type: object
remediationStrategy:
default: ResourceDeletion
description: RemediationStrategy is the remediation method
for unhealthy nodes.
enum:
- ResourceDeletion
- OutOfServiceTaint
type: string
sharedparameters:
additionalProperties:
type: string
Expand Down
33 changes: 27 additions & 6 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,17 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
"fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition)
}

// remove out-of-service taint when using OutOfServiceTaint remediation
if far.Spec.RemediationStrategy == v1alpha1.OutOfServiceTaintRemediationStrategy {
r.Log.Info("Removing OutOfService taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name)
if err := utils.RemoveTaint(r.Client, far.Name, utils.CreateOutOfServiceTaint()); err != nil && !apiErrors.IsNotFound(err) {
r.Log.Error(err, "Failed to add out-of-service taint", "CR's Name", req.Name)
return emptyResult, err
}
}

// remove node's taints
if err := utils.RemoveTaint(r.Client, far.Name); err != nil && !apiErrors.IsNotFound(err) {
if err := utils.RemoveTaint(r.Client, far.Name, utils.CreateFARNoExecuteTaint()); err != nil && !apiErrors.IsNotFound(err) {
return emptyResult, err
}
// remove finalizer
Expand All @@ -171,7 +180,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
// Add FAR (medik8s) remediation taint
r.Log.Info("Try adding FAR (Medik8s) remediation taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name)
if err := utils.AppendTaint(r.Client, far.Name); err != nil {
if err := utils.AppendTaint(r.Client, far.Name, utils.CreateFARNoExecuteTaint()); err != nil {
return emptyResult, err
}

Expand Down Expand Up @@ -221,10 +230,22 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
if meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) &&
!meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.SucceededType) {
// Fence agent action succeeded, and now we try to remove workloads (pods and their volume attachments)
r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name)
if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil {
r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name)
if far.Spec.RemediationStrategy == v1alpha1.ResourceDeletionRemediationStrategy {
// Fence agent action succeeded, and now we try to remove workloads (pods and their volume attachments)
r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name)
if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil {
r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name)
return emptyResult, err
}
} else if far.Spec.RemediationStrategy == v1alpha1.OutOfServiceTaintRemediationStrategy {
r.Log.Info("Adding OutOfService taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name)
if err := utils.AppendTaint(r.Client, req.Name, utils.CreateOutOfServiceTaint()); err != nil {
r.Log.Error(err, "Failed to add out-of-service taint", "CR's Name", req.Name)
return emptyResult, err
}
} else {
//this should never happen since we enforce valid values with kubebuilder
err := errors.New("unsupported remediation strategy")
return emptyResult, err
}
if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, far, r.Log); err != nil {
Expand Down
93 changes: 83 additions & 10 deletions controllers/fenceagentsremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ var _ = Describe("FAR Controller", func() {
}

// default FenceAgentsRemediation CR
underTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, testShareParam, testNodeParam)
underTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, testShareParam, testNodeParam, v1alpha1.ResourceDeletionRemediationStrategy)

Context("Functionality", func() {
Context("buildFenceAgentParams", func() {
When("FAR include different action than reboot", func() {
It("should succeed with a warning", func() {
invalidValTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, invalidShareParam, testNodeParam)
invalidValTestFAR := getFenceAgentsRemediation(workerNode, fenceAgentIPMI, invalidShareParam, testNodeParam, v1alpha1.ResourceDeletionRemediationStrategy)
invalidShareString, err := buildFenceAgentParams(invalidValTestFAR)
Expect(err).NotTo(HaveOccurred())
validShareString, err := buildFenceAgentParams(underTestFAR)
Expand All @@ -121,7 +121,7 @@ var _ = Describe("FAR Controller", func() {
})
})
})
Context("Reconcile", func() {
Context("Reconcile with ResourceDeletion strategy", func() {
nodeKey := client.ObjectKey{Name: workerNode}
farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace}
farNoExecuteTaint := utils.CreateFARNoExecuteTaint()
Expand All @@ -141,7 +141,7 @@ var _ = Describe("FAR Controller", func() {
Expect(k8sClient.Create(context.Background(), node)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), node)
Expect(k8sClient.Create(context.Background(), underTestFAR)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), underTestFAR)
DeferCleanup(deleteFAR, underTestFAR)
})

// TODO: add more scenarios?
Expand Down Expand Up @@ -177,7 +177,7 @@ var _ = Describe("FAR Controller", func() {
When("creating invalid FAR CR Name", func() {
BeforeEach(func() {
node = utils.GetNode("", workerNode)
underTestFAR = getFenceAgentsRemediation(dummyNode, fenceAgentIPMI, testShareParam, testNodeParam)
underTestFAR = getFenceAgentsRemediation(dummyNode, fenceAgentIPMI, testShareParam, testNodeParam, v1alpha1.ResourceDeletionRemediationStrategy)
})
It("should not have a finalizer nor taint, while the two VAs and one pod will remain", func() {
By("Not finding a matching node to FAR CR's name")
Expand Down Expand Up @@ -209,16 +209,71 @@ var _ = Describe("FAR Controller", func() {
})
})
})
Context("Reconcile with OutOfServiceTaint strategy", func() {
nodeKey := client.ObjectKey{Name: workerNode}
farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace}
farNoExecuteTaint := utils.CreateFARNoExecuteTaint()
resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value
conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status }
BeforeEach(func() {
// Create two VAs and two pods, and at the end clean them up with DeferCleanup
va1 := createVA(vaName1, workerNode)
va2 := createVA(vaName2, workerNode)
testPod := createRunningPod("far-test-1", testPodName, workerNode)
DeferCleanup(cleanupTestedResources, va1, va2, testPod)
farPod := createRunningPod("far-manager-test", farPodName, "")
DeferCleanup(k8sClient.Delete, context.Background(), farPod)
})
JustBeforeEach(func() {
// Create node, and FAR CR, and at the end clean them up with DeferCleanup
Expect(k8sClient.Create(context.Background(), node)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), node)
Expect(k8sClient.Create(context.Background(), underTestFAR)).To(Succeed())
DeferCleanup(deleteFAR, underTestFAR)
})

When("creating valid FAR CR", func() {
BeforeEach(func() {
node = utils.GetNode("", workerNode)
underTestFAR = getFenceAgentsRemediation(workerNode, fenceAgentIPMI, testShareParam, testNodeParam, v1alpha1.OutOfServiceTaintRemediationStrategy)
})
It("should have finalizer, taint, while the two VAs and one pod will be deleted", func() {
By("Searching for remediation taint")
Eventually(func(g Gomega) bool {
g.Expect(k8sClient.Get(context.Background(), nodeKey, node)).To(Succeed())
g.Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
res, _ := cliCommandsEquality(underTestFAR)
return utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint) && res
}, timeoutFinalizer, pollInterval).Should(BeTrue(), "taint should be added, and command format is correct")

// If taint was added, then definitely the finalizer was added as well
By("Having a finalizer if we have a remediation taint")
Expect(controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer)).To(BeTrue())

By("Not having any VAs nor the test pod")
testVADeletion(vaName1, resourceDeletionWasTriggered)
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Verifying correct conditions for successfull remediation")
Expect(underTestFAR.Status.LastUpdateTime).ToNot(BeNil())
verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue))
verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue))
})
})
})
})

// getFenceAgentsRemediation assigns the input to the FenceAgentsRemediation
func getFenceAgentsRemediation(nodeName, agent string, sharedparameters map[v1alpha1.ParameterName]string, nodeparameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string) *v1alpha1.FenceAgentsRemediation {
func getFenceAgentsRemediation(nodeName, agent string, sharedparameters map[v1alpha1.ParameterName]string, nodeparameters map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string, strategy v1alpha1.RemediationStrategyType) *v1alpha1.FenceAgentsRemediation {
return &v1alpha1.FenceAgentsRemediation{
ObjectMeta: metav1.ObjectMeta{Name: nodeName, Namespace: defaultNamespace},
Spec: v1alpha1.FenceAgentsRemediationSpec{
Agent: agent,
SharedParameters: sharedparameters,
NodeParameters: nodeparameters,
Agent: agent,
SharedParameters: sharedparameters,
NodeParameters: nodeparameters,
RemediationStrategy: strategy,
},
}
}
Expand Down Expand Up @@ -289,10 +344,28 @@ func cleanupTestedResources(va1, va2 *storagev1.VolumeAttachment, pod *corev1.Po
podTest := &corev1.Pod{}
if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), podTest); err == nil {
log.Info("Cleanup: clean pod", "pod name", podTest.Name)
Expect(k8sClient.Delete(context.Background(), podTest)).To(Succeed())
var grace client.GracePeriodSeconds = 0
Expect(k8sClient.Delete(context.Background(), podTest, grace)).To(Succeed())

EventuallyWithOffset(1, func() bool {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), podTest)
return apierrors.IsNotFound(err)
}, 10*time.Second, 100*time.Millisecond).Should(BeTrue())
}
}

func deleteFAR(far *v1alpha1.FenceAgentsRemediation) {
var grace client.GracePeriodSeconds = 0
var farNamespacedName = client.ObjectKey{Name: workerNode, Namespace: defaultNamespace}

ExpectWithOffset(1, k8sClient.Delete(context.Background(), far, grace)).To(Succeed())

EventuallyWithOffset(1, func() bool {
err := k8sClient.Get(context.Background(), farNamespacedName, far)
return apierrors.IsNotFound(err)
}, 10*time.Second, 100*time.Millisecond).Should(BeTrue())
}

// isEqualStringLists return true if two string lists share the same values
func isEqualStringLists(s1, s2 []string) bool {
sort.Strings(s1)
Expand Down
4 changes: 2 additions & 2 deletions pkg/utils/taint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ var _ = Describe("Utils-taint", func() {
Expect(k8sClient.Get(context.Background(), nodeKey, taintedNode)).To(Succeed())
// control-plane-role taint already exist by GetNode
By("adding medik8s NoSchedule taint")
Expect(AppendTaint(k8sClient, node0)).To(Succeed())
Expect(AppendTaint(k8sClient, node0, CreateFARNoExecuteTaint())).To(Succeed())
Expect(k8sClient.Get(context.Background(), nodeKey, taintedNode)).To(Succeed())
Expect(TaintExists(taintedNode.Spec.Taints, &controlPlaneRoleTaint)).To(BeTrue())
Expect(TaintExists(taintedNode.Spec.Taints, &farNoExecuteTaint)).To(BeTrue())
By("removing medik8s NoSchedule taint")
// We want to see that RemoveTaint only remove the taint it receives
Expect(RemoveTaint(k8sClient, node0)).To(Succeed())
Expect(RemoveTaint(k8sClient, node0, CreateFARNoExecuteTaint())).To(Succeed())
Expect(k8sClient.Get(context.Background(), nodeKey, taintedNode)).To(Succeed())
Expect(TaintExists(taintedNode.Spec.Taints, &controlPlaneRoleTaint)).To(BeTrue())
Expect(TaintExists(taintedNode.Spec.Taints, &farNoExecuteTaint)).To(BeFalse())
Expand Down
14 changes: 10 additions & 4 deletions pkg/utils/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,22 @@ func CreateFARNoExecuteTaint() corev1.Taint {
}
}

// CreateOutOfServiceTaint returns an OutOfService taint
func CreateOutOfServiceTaint() corev1.Taint {
return corev1.Taint{
Key: v1alpha1.OutOfServiceTaintKey,
Effect: corev1.TaintEffectNoExecute,
}
}

// AppendTaint appends new taint to the taint list when it is not present, and returns error if it fails in the process
func AppendTaint(r client.Client, nodeName string) error {
func AppendTaint(r client.Client, nodeName string, taint corev1.Taint) error {
// find node by name
node, err := GetNodeWithName(r, nodeName)
if err != nil {
return err
}

taint := CreateFARNoExecuteTaint()
// check if taint doesn't exist
if TaintExists(node.Spec.Taints, &taint) {
return nil
Expand All @@ -79,14 +86,13 @@ func AppendTaint(r client.Client, nodeName string) error {
}

// RemoveTaint removes taint from the taint list when it is existed, and returns error if it fails in the process
func RemoveTaint(r client.Client, nodeName string) error {
func RemoveTaint(r client.Client, nodeName string, taint corev1.Taint) error {
// find node by name
node, err := GetNodeWithName(r, nodeName)
if err != nil {
return err
}

taint := CreateFARNoExecuteTaint()
// check if taint exist
if !TaintExists(node.Spec.Taints, &taint) {
return nil
Expand Down

0 comments on commit c2e03f9

Please sign in to comment.