Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stale Cache on Fast Reconcile #74

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ type FenceAgentsRemediationStatus struct {
//+optional
// +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:io.kubernetes.conditions"
Conditions []metav1.Condition `json:"conditions,omitempty"`

// LastUpdateTime is the last time the status was updated.
//
//+optional
//+kubebuilder:validation:Type=string
//+kubebuilder:validation:Format=date-time
//+operator-sdk:csv:customresourcedefinitions:type=status
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
}

//+kubebuilder:object:root=true
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ spec:
path: conditions
x-descriptors:
- urn:alm:descriptor:io.kubernetes.conditions
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
version: v1alpha1
- description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates
API
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ spec:
path: conditions
x-descriptors:
- urn:alm:descriptor:io.kubernetes.conditions
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
version: v1alpha1
- description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates
API
Expand Down
61 changes: 45 additions & 16 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilErrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
Expand Down Expand Up @@ -85,7 +86,9 @@ func (r *FenceAgentsRemediationReconciler) SetupWithManager(mgr ctrl.Manager) er
func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (finalResult ctrl.Result, finalErr error) {
r.Log.Info("Begin FenceAgentsRemediation Reconcile")
defer r.Log.Info("Finish FenceAgentsRemediation Reconcile")
// Reconcile requeue results
emptyResult := ctrl.Result{}
requeueImmediately := ctrl.Result{Requeue: true}

// Fetch the FenceAgentsRemediation instance
far := &v1alpha1.FenceAgentsRemediation{}
Expand All @@ -100,13 +103,13 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, err
}

// At the end of each Reconcile try to update CR's status
// At the end of each Reconcile we try to update CR's status
defer func() {
if updateErr := r.updateStatus(ctx, far); updateErr != nil {
if !apiErrors.IsConflict(updateErr) {
finalErr = utilErrors.NewAggregate([]error{updateErr, finalErr})
if apiErrors.IsConflict(updateErr) {
r.Log.Info("Conflict has occurred on updating the CR status")
}
finalResult.RequeueAfter = time.Second
finalErr = utilErrors.NewAggregate([]error{updateErr, finalErr})
}
}()

Expand All @@ -119,14 +122,14 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
if !valid {
r.Log.Error(err, "Didn't find a node matching the CR's name", "CR's Name", req.Name)
err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, &far.Status.Conditions, r.Log)
err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, far, r.Log)
return emptyResult, err
}

// Check NHC timeout annotation
if isTimedOutByNHC(far) {
r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator")
err := updateConditions(v1alpha1.RemediationInterruptedByNHC, &far.Status.Conditions, r.Log)
err := updateConditions(v1alpha1.RemediationInterruptedByNHC, far, r.Log)
return emptyResult, err
}

Expand All @@ -138,10 +141,10 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
r.Log.Info("Finalizer was added", "CR Name", req.Name)

if err := updateConditions(v1alpha1.RemediationStarted, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.RemediationStarted, far, r.Log); err != nil {
return emptyResult, err
}
return ctrl.Result{Requeue: true}, nil
return requeueImmediately, nil
} else if controllerutil.ContainsFinalizer(far, v1alpha1.FARFinalizer) && !far.ObjectMeta.DeletionTimestamp.IsZero() {
// Delete CR only when a finalizer and DeletionTimestamp are set
r.Log.Info("CR's deletion timestamp is not zero, and FAR finalizer exists", "CR Name", req.Name)
Expand Down Expand Up @@ -211,10 +214,10 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}

r.Log.Info("Fence Agent command was finished successfully", "Fence Agent", far.Spec.Agent, "Node name", req.Name, "Response", SuccessFAResponse)
if err := updateConditions(v1alpha1.FenceAgentSucceeded, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.FenceAgentSucceeded, far, r.Log); err != nil {
return emptyResult, err
}
return ctrl.Result{Requeue: true}, nil
return requeueImmediately, nil
}
if meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) &&
!meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.SucceededType) {
Expand All @@ -224,8 +227,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name)
return emptyResult, err
}

if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, far, r.Log); err != nil {
return emptyResult, err
}
r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name)
Expand All @@ -247,27 +249,46 @@ func isTimedOutByNHC(far *v1alpha1.FenceAgentsRemediation) bool {
func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far *v1alpha1.FenceAgentsRemediation) error {
if err := r.Client.Status().Update(ctx, far); err != nil {
if !apiErrors.IsConflict(err) {
razo7 marked this conversation as resolved.
Show resolved Hide resolved
r.Log.Error(err, "failed to update far status")
r.Log.Error(err, "failed to update far status in case on a non conflict")
}
return err
}
// Wait until the cache is updated in order to prevent reading a stale status in the next reconcile
// and making wrong decisions based on it.
pollingTimeout := 5 * time.Second
pollErr := wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, pollingTimeout, true, func(ctx context.Context) (bool, error) {
tmpFar := &v1alpha1.FenceAgentsRemediation{}
if err := r.Client.Get(ctx, client.ObjectKeyFromObject(far), tmpFar); err != nil {
if apiErrors.IsNotFound(err) {
// nothing we can do anymore
return true, nil
}
return false, nil
}
return tmpFar.Status.LastUpdateTime != nil && (tmpFar.Status.LastUpdateTime.Equal(far.Status.LastUpdateTime) || tmpFar.Status.LastUpdateTime.After(far.Status.LastUpdateTime.Time)), nil
})
if pollErr != nil {
return fmt.Errorf("failed to wait for updated cache to be updated in status update after %f seconds of timeout - %w", pollingTimeout.Seconds(), pollErr)
}
return nil
}

// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ConditionsChangeReason.
// return an error if an unknown ConditionsChangeReason is provided
func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error {
func updateConditions(reason v1alpha1.ConditionsChangeReason, far *v1alpha1.FenceAgentsRemediation, log logr.Logger) error {

var (
processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus
conditionMessage string
)
currentConditions := &far.Status.Conditions
conditionHasBeenChanged := false

// RemediationFinishedNodeNotFound and RemediationInterruptedByNHC reasons can happen at any time the Reconcile runs
// Except these two reasons, there are another three reasons that can only happen one after another
// RemediationStarted will always be the first reason (out of these three)
// FenceAgentSucceeded can only happen after RemediationStarted happened
// RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened

switch reason {
case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationInterruptedByNHC:
processingConditionStatus = metav1.ConditionFalse
Expand Down Expand Up @@ -302,7 +323,6 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
return err
}

log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason))
// if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value
if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) {
meta.SetStatusCondition(currentConditions, metav1.Condition{
Expand All @@ -311,6 +331,7 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
Reason: string(reason),
Message: conditionMessage,
})
conditionHasBeenChanged = true
}

// if the requested Status.Conditions.FenceAgentActionSucceeded is different then the current one, then update Status.Conditions.FenceAgentActionSucceeded value
Expand All @@ -321,6 +342,7 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
Reason: string(reason),
Message: conditionMessage,
})
conditionHasBeenChanged = true
}

// if the requested Status.Conditions.Succeeded is different then the current one, then update Status.Conditions.Succeeded value
Expand All @@ -331,7 +353,14 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
Reason: string(reason),
Message: conditionMessage,
})
conditionHasBeenChanged = true
}
// Only update lastUpdate when there were other changes
if conditionHasBeenChanged {
now := metav1.Now()
far.Status.LastUpdateTime = &now
}
log.Info("Updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason), "LastUpdateTime", far.Status.LastUpdateTime)

return nil
}
Expand Down
17 changes: 10 additions & 7 deletions controllers/fenceagentsremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ const (
vaName2 = "va-test-2"

// intervals
timeoutDeletion = 10 * time.Second // this timeout is used after all the other steps have finished successfully
pollInterval = 250 * time.Millisecond
timeoutDeletion = 2 * time.Second // this timeout is used after all the other steps have finished successfully
timeoutFinalizer = 1 * time.Second
pollInterval = 250 * time.Millisecond
)

var (
Expand Down Expand Up @@ -155,7 +156,7 @@ var _ = Describe("FAR Controller", func() {
g.Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
res, _ := cliCommandsEquality(underTestFAR)
return utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint) && res
}, 100*time.Millisecond, 10*time.Millisecond).Should(BeTrue(), "taint should be added, and command format is correct")
}, timeoutFinalizer, pollInterval).Should(BeTrue(), "taint should be added, and command format is correct")

// If taint was added, then definitely the finalizer was added as well
By("Having a finalizer if we have a remediation taint")
Expand All @@ -166,7 +167,8 @@ var _ = Describe("FAR Controller", func() {
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Having Succeeded, FenceAgentActionSucceeded conditions set to true, and Processing set to false")
By("Verifying correct conditions for successfull remediation")
Expect(underTestFAR.Status.LastUpdateTime).ToNot(BeNil())
verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue))
verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue))
Expand All @@ -184,10 +186,10 @@ var _ = Describe("FAR Controller", func() {

By("Not having finalizer")
farNamespacedName.Name = underTestFAR.Name
Eventually(func(g Gomega) bool {
Consistently(func(g Gomega) bool {
razo7 marked this conversation as resolved.
Show resolved Hide resolved
g.Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
return controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer)
}, 100*time.Millisecond, 10*time.Millisecond).Should(BeFalse(), "finalizer shouldn't be added")
}, timeoutFinalizer, pollInterval).Should(BeFalse(), "finalizer shouldn't be added")

// If finalizer is missing, then a taint shouldn't be existed
By("Not having remediation taint")
Expand All @@ -199,7 +201,8 @@ var _ = Describe("FAR Controller", func() {
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Having all three conditions set to false")
By("Verifying correct conditions for unsuccessfull remediation")
Expect(underTestFAR.Status.LastUpdateTime).ToNot(BeNil())
verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse))
Expand Down