Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable out-of-service taint in FAR #92

Merged
merged 5 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,14 @@ const (
FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded"
// RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully"

ResourceDeletionRemediationStrategy = RemediationStrategyType("ResourceDeletion")
OutOfServiceTaintRemediationStrategy = RemediationStrategyType("OutOfServiceTaint")
)

// ParameterName is the key type of the outer NodeParameters map in
// FenceAgentsRemediationSpec.
type ParameterName string
// NodeName is the key type of the inner, per-parameter map in NodeParameters.
type NodeName string
// RemediationStrategyType is the string type of the RemediationStrategy spec
// field; ResourceDeletionRemediationStrategy and
// OutOfServiceTaintRemediationStrategy are its declared values.
type RemediationStrategyType string

// FenceAgentsRemediationSpec defines the desired state of FenceAgentsRemediation
type FenceAgentsRemediationSpec struct {
Expand Down Expand Up @@ -84,6 +88,15 @@ type FenceAgentsRemediationSpec struct {
// NodeParameters are passed to the fencing agent according to the node that is fenced, since they are node specific
//+operator-sdk:csv:customresourcedefinitions:type=spec
NodeParameters map[ParameterName]map[NodeName]string `json:"nodeparameters,omitempty"`

// RemediationStrategy is the remediation method for unhealthy nodes.
// Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion".
// ResourceDeletion will iterate over all pods related to the unhealthy node and delete them.
// OutOfServiceTaint will add the out-of-service taint which is a new well-known taint "node.kubernetes.io/out-of-service"
// that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.
// +kubebuilder:default:="ResourceDeletion"
// +kubebuilder:validation:Enum=ResourceDeletion;OutOfServiceTaint
RemediationStrategy RemediationStrategyType `json:"remediationStrategy,omitempty"`
}

// FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation
Expand Down
24 changes: 22 additions & 2 deletions api/v1alpha1/fenceagentsremediation_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ var (
webhookFARLog = logf.Log.WithName("fenceagentsremediation-resource")
// verify agent existence with os.Stat function
agentValidator = validation.NewAgentValidator()
// isOutOfServiceTaintSupported will be set to true in case out-of-service taint is supported (k8s 1.26 or higher)
isOutOfServiceTaintSupported bool
)

func (r *FenceAgentsRemediation) SetupWebhookWithManager(mgr ctrl.Manager) error {
Expand All @@ -53,13 +55,13 @@ var _ webhook.Validator = &FenceAgentsRemediation{}
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
func (far *FenceAgentsRemediation) ValidateCreate() (admission.Warnings, error) {
webhookFARLog.Info("validate create", "name", far.Name)
return validateAgentName(far.Spec.Agent)
return validateFAR(&far.Spec)
}

// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type
func (far *FenceAgentsRemediation) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
webhookFARLog.Info("validate update", "name", far.Name)
return validateAgentName(far.Spec.Agent)
return validateFAR(&far.Spec)
}

// ValidateDelete implements webhook.Validator so a webhook will be registered for the type
Expand All @@ -68,6 +70,17 @@ func (far *FenceAgentsRemediation) ValidateDelete() (admission.Warnings, error)
return nil, nil
}

// validateFAR runs the spec validations used by both the FenceAgentsRemediation
// and the FenceAgentsRemediationTemplate webhooks: the agent name is checked
// first, and only if that succeeds is the remediation strategy checked.
// Warnings from the agent-name check are dropped; only its error is propagated.
func validateFAR(farSpec *FenceAgentsRemediationSpec) (admission.Warnings, error) {
	_, agentErr := validateAgentName(farSpec.Agent)
	if agentErr != nil {
		return nil, agentErr
	}
	return validateStrategy(farSpec.RemediationStrategy)
}

// InitOutOfServiceTaintSupportedFlag stores outOfServiceTaintSupported in the
// package-level isOutOfServiceTaintSupported flag, which validateStrategy reads
// to decide whether the OutOfServiceTaint remediation strategy is accepted.
func InitOutOfServiceTaintSupportedFlag(outOfServiceTaintSupported bool) {
slintes marked this conversation as resolved.
Show resolved Hide resolved
isOutOfServiceTaintSupported = outOfServiceTaintSupported
}

func validateAgentName(agent string) (admission.Warnings, error) {
exists, err := agentValidator.ValidateAgentName(agent)
if err != nil {
Expand All @@ -78,3 +91,10 @@ func validateAgentName(agent string) (admission.Warnings, error) {
}
return nil, nil
}

// validateStrategy rejects the OutOfServiceTaint remediation strategy while the
// package-level isOutOfServiceTaintSupported flag is false. Any other strategy
// value, or OutOfServiceTaint with the flag set, is accepted. It never returns
// warnings.
func validateStrategy(farRemStrategy RemediationStrategyType) (admission.Warnings, error) {
	wantsTaint := farRemStrategy == OutOfServiceTaintRemediationStrategy
	if !wantsTaint || isOutOfServiceTaintSupported {
		return nil, nil
	}
	return nil, fmt.Errorf("%s remediation strategy is not supported at kubernetes version lower than 1.26, please use a different remediation strategy", OutOfServiceTaintRemediationStrategy)
}
77 changes: 66 additions & 11 deletions api/v1alpha1/fenceagentsremediation_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,41 @@ var _ = Describe("FenceAgentsRemediation Validation", func() {
When("agent name match format and binary", func() {
It("should be accepted", func() {
far := getTestFAR(validAgentName)
_, err := far.ValidateCreate()
Expect(err).ToNot(HaveOccurred())
Expect(far.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

When("agent name was not found ", func() {
It("should be rejected", func() {
far := getTestFAR(invalidAgentName)
_, err := far.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(far.ValidateCreate()).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
razo7 marked this conversation as resolved.
Show resolved Hide resolved
var outOfServiceStrategy *FenceAgentsRemediation

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFAR(validAgentName, OutOfServiceTaintRemediationStrategy)
})
When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})
When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
Expand All @@ -37,8 +61,7 @@ var _ = Describe("FenceAgentsRemediation Validation", func() {
})
It("should be accepted", func() {
far := getTestFAR(validAgentName)
_, err := far.ValidateUpdate(oldFAR)
Expect(err).ToNot(HaveOccurred())
Expect(far.ValidateUpdate(oldFAR)).Error().NotTo(HaveOccurred())
})
})

Expand All @@ -48,21 +71,53 @@ var _ = Describe("FenceAgentsRemediation Validation", func() {
})
It("should be rejected", func() {
far := getTestFAR(invalidAgentName)
_, err := far.ValidateUpdate(oldFAR)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(far.ValidateUpdate(oldFAR)).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediation
var resourceDeletionStrategy *FenceAgentsRemediation

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFAR(validAgentName, OutOfServiceTaintRemediationStrategy)
resourceDeletionStrategy = getFAR(validAgentName, ResourceDeletionRemediationStrategy)
})
When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().NotTo(HaveOccurred())
})
})
When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
})

// getTestFAR builds a FenceAgentsRemediation fixture for agentName that uses
// the ResourceDeletion remediation strategy.
func getTestFAR(agentName string) *FenceAgentsRemediation {
	far := getFAR(agentName, ResourceDeletionRemediationStrategy)
	return far
}

func getFAR(agentName string, strategy RemediationStrategyType) *FenceAgentsRemediation {
return &FenceAgentsRemediation{
ObjectMeta: metav1.ObjectMeta{
Name: "test-" + agentName,
},
Spec: FenceAgentsRemediationSpec{
Agent: agentName,
Agent: agentName,
RemediationStrategy: strategy,
},
}
}
4 changes: 2 additions & 2 deletions api/v1alpha1/fenceagentsremediationtemplate_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ var _ webhook.Validator = &FenceAgentsRemediationTemplate{}
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
func (farTemplate *FenceAgentsRemediationTemplate) ValidateCreate() (admission.Warnings, error) {
webhookFARTemplateLog.Info("validate create", "name", farTemplate.Name)
return validateAgentName(farTemplate.Spec.Template.Spec.Agent)
return validateFAR(&farTemplate.Spec.Template.Spec)
}

// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type
func (farTemplate *FenceAgentsRemediationTemplate) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
webhookFARTemplateLog.Info("validate update", "name", farTemplate.Name)
return validateAgentName(farTemplate.Spec.Template.Spec.Agent)
return validateFAR(&farTemplate.Spec.Template.Spec)
}

// ValidateDelete implements webhook.Validator so a webhook will be registered for the type
Expand Down
81 changes: 70 additions & 11 deletions api/v1alpha1/fenceagentsremediationtemplate_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,43 @@ var _ = Describe("FenceAgentsRemediationTemplate Validation", func() {
When("agent name match format and binary", func() {
It("should be accepted", func() {
farTemplate := getTestFARTemplate(validAgentName)
_, err := farTemplate.ValidateCreate()
Expect(err).ToNot(HaveOccurred())
Expect(farTemplate.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

When("agent name was not found ", func() {
It("should be rejected", func() {
farTemplate := getTestFARTemplate(invalidAgentName)
_, err := farTemplate.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(farTemplate.ValidateCreate()).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediationTemplate

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFARTemplate(validAgentName, OutOfServiceTaintRemediationStrategy)
})

When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateCreate()).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
Expand All @@ -37,8 +63,7 @@ var _ = Describe("FenceAgentsRemediationTemplate Validation", func() {
})
It("should be accepted", func() {
farTemplate := getTestFARTemplate(validAgentName)
_, err := farTemplate.ValidateUpdate(oldFARTemplate)
Expect(err).ToNot(HaveOccurred())
Expect(farTemplate.ValidateUpdate(oldFARTemplate)).Error().NotTo(HaveOccurred())
})
})

Expand All @@ -48,23 +73,57 @@ var _ = Describe("FenceAgentsRemediationTemplate Validation", func() {
})
It("should be rejected", func() {
farTemplate := getTestFARTemplate(invalidAgentName)
_, err := farTemplate.ValidateUpdate(oldFARTemplate)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("unsupported fence agent: %s", invalidAgentName))
Expect(farTemplate.ValidateUpdate(oldFARTemplate)).Error().To(MatchError(ContainSubstring("unsupported fence agent: %s", invalidAgentName)))
})
})

Context("with OutOfServiceTaint strategy", func() {
var outOfServiceStrategy *FenceAgentsRemediationTemplate
var resourceDeletionStrategy *FenceAgentsRemediationTemplate

BeforeEach(func() {
orgValue := isOutOfServiceTaintSupported
DeferCleanup(func() { isOutOfServiceTaintSupported = orgValue })

outOfServiceStrategy = getFARTemplate(validAgentName, OutOfServiceTaintRemediationStrategy)
resourceDeletionStrategy = getFARTemplate(validAgentName, ResourceDeletionRemediationStrategy)
})

When("out of service taint is supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = true
})
It("should be allowed", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().NotTo(HaveOccurred())
})
})

When("out of service taint is not supported", func() {
BeforeEach(func() {
isOutOfServiceTaintSupported = false
})
It("should be denied", func() {
Expect(outOfServiceStrategy.ValidateUpdate(resourceDeletionStrategy)).Error().To(MatchError(ContainSubstring(outOfServiceTaintUnsupportedMsg)))
})
})
})
})
})

// getTestFARTemplate builds a FenceAgentsRemediationTemplate fixture for
// agentName that uses the ResourceDeletion remediation strategy.
func getTestFARTemplate(agentName string) *FenceAgentsRemediationTemplate {
	farTemplate := getFARTemplate(agentName, ResourceDeletionRemediationStrategy)
	return farTemplate
}

func getFARTemplate(agentName string, strategy RemediationStrategyType) *FenceAgentsRemediationTemplate {
return &FenceAgentsRemediationTemplate{
ObjectMeta: metav1.ObjectMeta{
Name: "test-" + agentName + "-template",
},
Spec: FenceAgentsRemediationTemplateSpec{
Template: FenceAgentsRemediationTemplateResource{
Spec: FenceAgentsRemediationSpec{
Agent: agentName,
Agent: agentName,
RemediationStrategy: strategy,
},
},
},
Expand Down
5 changes: 3 additions & 2 deletions api/v1alpha1/webhook_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ import (
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

const (
validAgentName = "fence_ipmilan"
invalidAgentName = "fence_ip"
validAgentName = "fence_ipmilan"
invalidAgentName = "fence_ip"
outOfServiceTaintUnsupportedMsg = "OutOfServiceTaint remediation strategy is not supported at kubernetes version lower than 1.26, please use a different remediation strategy"
)

var (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ spec:
description: NodeParameters are passed to the fencing agent according
to the node that is fenced, since they are node specific
type: object
remediationStrategy:
default: ResourceDeletion
description: |-
RemediationStrategy is the remediation method for unhealthy nodes.
Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion".
ResourceDeletion will iterate over all pods related to the unhealthy node and delete them.
OutOfServiceTaint will add the out-of-service taint which is a new well-known taint "node.kubernetes.io/out-of-service"
that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.
enum:
- ResourceDeletion
- OutOfServiceTaint
type: string
retrycount:
default: 5
description: RetryCount is the number of times the fencing agent will
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@ spec:
according to the node that is fenced, since they are node
specific
type: object
remediationStrategy:
default: ResourceDeletion
description: |-
RemediationStrategy is the remediation method for unhealthy nodes.
Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion".
ResourceDeletion will iterate over all pods related to the unhealthy node and delete them.
OutOfServiceTaint will add the out-of-service taint which is a new well-known taint "node.kubernetes.io/out-of-service"
that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.
enum:
- ResourceDeletion
- OutOfServiceTaint
type: string
retrycount:
default: 5
description: RetryCount is the number of times the fencing
Expand Down
Loading
Loading