diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go
index 67c5e93e..c6015ec7 100644
--- a/test/e2e/far_e2e_test.go
+++ b/test/e2e/far_e2e_test.go
@@ -38,7 +38,6 @@ const (
     //TODO: try to minimize timeout
     // eventually parameters
-    timeoutLogs        = "3m0s"
     timeoutTaint       = "2s"   // Timeout for checking the FAR taint
     timeoutReboot      = "6m0s" // fencing with fence_aws should be completed within 6 minutes
     timeoutAfterReboot = "5s"   // Timeout for verifying steps after the node has been rebooted
@@ -48,56 +47,71 @@ const (
     skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
 )
 
-var remediationTimes []time.Duration
+var (
+    stopTesting      bool
+    remediationTimes []time.Duration
+)
 
 var _ = Describe("FAR E2e", func() {
     var (
         fenceAgent, nodeIdentifierPrefix string
         testShareParam                   map[v1alpha1.ParameterName]string
         testNodeParam                    map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string
-        selectedNode                     *corev1.Node
-        nodeName                         string
-        pod                              *corev1.Pod
-        startTime, nodeBootTimeBefore    time.Time
         err                              error
     )
-    BeforeEach(func() {
-        // create FAR CR spec based on OCP platformn
-        clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
-        Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
-        log.Info("Begin e2e test", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))
-
-        switch clusterPlatform.Status.PlatformStatus.Type {
-        case configv1.AWSPlatformType:
-            fenceAgent = fenceAgentAWS
-            nodeIdentifierPrefix = nodeIdentifierPrefixAWS
-            By("running fence_aws")
-        case configv1.BareMetalPlatformType:
-            fenceAgent = fenceAgentIPMI
-            nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
-            By("running fence_ipmilan")
-        default:
-            Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
-        }
+    When("trying to identify cluster platform", func() {
+        It("should be AWS/BareMetal for finding the needed cluster and node parameters", func() {
+            // create FAR CR spec based on OCP platform
+            clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
+            Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
+            log.Info("Getting Cluster Information", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))
+
+            switch clusterPlatform.Status.PlatformStatus.Type {
+            case configv1.AWSPlatformType:
+                fenceAgent = fenceAgentAWS
+                nodeIdentifierPrefix = nodeIdentifierPrefixAWS
+                By("running fence_aws")
+            case configv1.BareMetalPlatformType:
+                fenceAgent = fenceAgentIPMI
+                nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
+                By("running fence_ipmilan")
+            default:
+                stopTesting = true // Mark to stop subsequent tests
+                Fail("FAR hasn't been tested on this kind of cluster (non-AWS or BareMetal)")
+            }
 
-        testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
-        Expect(err).ToNot(HaveOccurred(), "can't get shared information")
-        testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
-        Expect(err).ToNot(HaveOccurred(), "can't get node information")
+            testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
+            Expect(err).ToNot(HaveOccurred(), "can't get shared information")
+            testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
+            Expect(err).ToNot(HaveOccurred(), "can't get node information")
+        })
     })
 
     // runFARTests is a utility function to run FAR tests.
     // It accepts a remediation strategy and a condition to determine if the tests should be skipped.
     runFARTests := func(remediationStrategy v1alpha1.RemediationStrategyType, skipCondition func() bool) {
-        var availableWorkerNodes *corev1.NodeList
+        var (
+            availableWorkerNodes          *corev1.NodeList
+            selectedNode                  *corev1.Node
+            nodeName                      string
+            pod                           *corev1.Pod
+            startTime, nodeBootTimeBefore time.Time
+        )
         BeforeEach(func() {
+            if stopTesting {
+                Skip("Skip testing due to unsupported platform")
+            }
             if skipCondition() {
                 Skip("Skip this block due to unsupported condition")
             }
             if availableWorkerNodes == nil {
-                availableWorkerNodes = getAvailableWorkerNodes()
+                availableWorkerNodes = getReadyWorkerNodes()
             }
+            if len(availableWorkerNodes.Items) < 1 {
+                Fail("There isn't an available (and Ready) worker node in the cluster")
+            }
+
             selectedNode = pickRemediatedNode(availableWorkerNodes)
             nodeName = selectedNode.Name
             printNodeDetails(selectedNode, nodeIdentifierPrefix, testNodeParam)
@@ -107,7 +121,7 @@ var _ = Describe("FAR E2e", func() {
             Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node")
 
             // create tested pod which will be deleted by the far CR
-            pod = createTestedPod(nodeName, testContainerName)
+            pod = createTestedPod(nodeName)
             DeferCleanup(cleanupTestedResources, pod)
 
             // set the node as "unhealthy" by disabling kubelet
@@ -240,25 +254,31 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
     return testNodeParam, nil
 }
 
-// getAvailableNodes a list of available worker nodes in the cluster
-func getAvailableWorkerNodes() *corev1.NodeList {
-    availableNodes := &corev1.NodeList{}
+// getReadyWorkerNodes returns a list of ready worker nodes in the cluster, if any
+func getReadyWorkerNodes() *corev1.NodeList {
+    availableWorkerNodes := &corev1.NodeList{}
     selector := labels.NewSelector()
-    requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
+    requirement, err := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
+    Expect(err).To(BeNil())
     selector = selector.Add(*requirement)
-    Expect(k8sClient.List(context.Background(), availableNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
-    if len(availableNodes.Items) < 1 {
-        Fail("No worker nodes found in the cluster")
+    Expect(k8sClient.List(context.Background(), availableWorkerNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
+
+    // Filter nodes to only include those in "Ready" state
+    readyWorkerNodes := &corev1.NodeList{}
+    for _, node := range availableWorkerNodes.Items {
+        for _, condition := range node.Status.Conditions {
+            if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
+                readyWorkerNodes.Items = append(readyWorkerNodes.Items, node)
+                break // "Ready" was found
+            }
+        }
     }
-    return availableNodes
+    return readyWorkerNodes
 }
 
 // pickRemediatedNode randomly returns a next remediated node from the current available nodes,
 // and then the node is removed from the list of available nodes
 func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
-    if len(availableNodes.Items) < 1 {
-        Fail("No available node found for remediation")
-    }
     // Generate a random seed based on the current time
     r := rand.New(rand.NewSource(time.Now().UnixNano()))
     // Randomly select a worker node
@@ -270,14 +290,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
 }
 
 // createTestedPod creates tested pod which will be deleted by the far CR
-func createTestedPod(nodeName, containerName string) *corev1.Pod {
+func createTestedPod(nodeName string) *corev1.Pod {
     pod := e2eUtils.GetPod(nodeName, testContainerName)
     pod.Name = testPodName
     pod.Namespace = testNsName
     pod.Spec.Tolerations = []corev1.Toleration{
         {
             Key:      v1alpha1.FARNoExecuteTaintKey,
-            Operator: corev1.TolerationOpEqual,
+            Operator: corev1.TolerationOpExists,
             Effect:   corev1.TaintEffectNoExecute,
         },
     }
@@ -303,7 +323,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para
             SharedParameters:    sharedParameters,
             NodeParameters:      nodeParameters,
             RemediationStrategy: strategy,
-            RetryCount:          5,
+            RetryCount:          10,
             RetryInterval:       metav1.Duration{Duration: 20 * time.Second},
             Timeout:             metav1.Duration{Duration: 60 * time.Second},
         },
diff --git a/test/e2e/utils/cluster.go b/test/e2e/utils/cluster.go
index 07098066..92e3a8d4 100644
--- a/test/e2e/utils/cluster.go
+++ b/test/e2e/utils/cluster.go
@@ -48,6 +48,23 @@ func GetSecretData(clientSet *kubernetes.Clientset, secretName, secretNamespace,
     return string(secret.Data[secretData1]), string(secret.Data[secretData2]), nil
 }
 
+// getNodeRoleFromMachine returns the node role, "master/control-plane" or "worker", from the machine labels if present, otherwise "unknown"
+func getNodeRoleFromMachine(nodeLabels map[string]string) string {
+    machineLabelPrefixRole := "machine.openshift.io/cluster-api-machine-"
+    // look for machine.openshift.io/cluster-api-machine-role or machine.openshift.io/cluster-api-machine-type label
+    for _, labelKey := range []string{machineLabelPrefixRole + "role", machineLabelPrefixRole + "type"} {
+        if labelVal, isFound := nodeLabels[labelKey]; isFound {
+            if labelVal == "worker" {
+                return "worker"
+            }
+            if labelVal == "master" {
+                return "master/control-plane"
+            }
+        }
+    }
+    return "unknown"
+}
+
 // GetAWSNodeInfoList returns a list of the node names and their identification, e.g., AWS instance ID
 func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.NodeName]string, error) {
     // oc get machine -n openshift-machine-api MACHINE_NAME -o jsonpath='{.spec.providerID}'
@@ -62,24 +79,28 @@ func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.No
     }
 
     var missNodeMachineErr error
+    missNodeMachineNames := ""
     // creates map for nodeName and AWS instance ID
     for _, machine := range machineList.Items {
         if machine.Status.NodeRef == nil || machine.Spec.ProviderID == nil {
             if missNodeMachineErr != nil {
-                missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing\n%w", machine.Spec.Name, missNodeMachineErr)
+                missNodeMachineNames += ", " + machine.ObjectMeta.GetName()
+                missNodeMachineErr = fmt.Errorf("machines %s are not associated with any node or their provider IDs are missing", missNodeMachineNames)
             } else {
-                missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing", machine.Spec.Name)
+                missNodeMachineNames = machine.ObjectMeta.GetName()
+                missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or its provider ID is missing", machine.ObjectMeta.GetName())
             }
-            continue
+        } else {
+            nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
+            nodeRole := getNodeRoleFromMachine(machine.Labels)
+            providerID := *machine.Spec.ProviderID
+
+            // Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
+            splitProviderID := strings.Split(providerID, "/i-")
+            instanceID := "i-" + splitProviderID[1]
+            nodeList[nodeName] = instanceID
+            fmt.Printf("node: %s, Role: %s, Instance ID: %s\n", nodeName, nodeRole, instanceID)
         }
-
-        nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
-        providerID := *machine.Spec.ProviderID
-
-        // Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
-        splitedProviderID := strings.Split(providerID, "/i-")
-        instanceID := "i-" + splitedProviderID[1]
-        nodeList[nodeName] = instanceID
-        fmt.Printf("node: %s Instance ID: %s \n", nodeName, instanceID)
     }
     return nodeList, missNodeMachineErr
 }
diff --git a/test/e2e/utils/command.go b/test/e2e/utils/command.go
index cef9f22d..61ec205e 100644
--- a/test/e2e/utils/command.go
+++ b/test/e2e/utils/command.go
@@ -193,7 +193,7 @@ func GetPod(nodeName, containerName string) *corev1.Pod {
         Tolerations: []corev1.Toleration{
             {
                 Key:      v1alpha1.FARNoExecuteTaintKey,
-                Operator: corev1.TolerationOpEqual,
+                Operator: corev1.TolerationOpExists,
                 Effect:   corev1.TaintEffectNoExecute,
             },
             {
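The role lookup added in cluster.go is pure map inspection, so it can be exercised without a live cluster. A minimal in-package test sketch (the package name `utils` and the table entries are assumptions for illustration, not part of the patch):

```go
package utils

import "testing"

// TestGetNodeRoleFromMachine exercises the label fallback order used by
// getNodeRoleFromMachine: the "...-role" label is checked first, then
// "...-type", and "unknown" is returned when neither maps to a known role.
func TestGetNodeRoleFromMachine(t *testing.T) {
	cases := []struct {
		name   string
		labels map[string]string
		want   string
	}{
		{"worker via role label", map[string]string{"machine.openshift.io/cluster-api-machine-role": "worker"}, "worker"},
		{"master via type label", map[string]string{"machine.openshift.io/cluster-api-machine-type": "master"}, "master/control-plane"},
		{"no role labels", map[string]string{}, "unknown"},
	}
	for _, c := range cases {
		if got := getNodeRoleFromMachine(c.labels); got != c.want {
			t.Errorf("%s: got %q, want %q", c.name, got, c.want)
		}
	}
}
```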