Mirror of https://github.com/optim-enterprises-bv/kubernetes.git (synced 2025-12-04 15:15:36 +00:00)
Merge pull request #127055 from dims/fix-etcd-failures-in-ci-kubernetes-e2e-cos-gce-disruptive-canary
Fix etcd failures in ci-kubernetes-e2e-cos-gce-disruptive-canary
@@ -34,6 +34,7 @@ import (
     admissionapi "k8s.io/pod-security-admission/api"
 
     "github.com/onsi/ginkgo/v2"
+    "github.com/onsi/gomega"
 )
 
 var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
@@ -47,7 +48,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
     // - master access
     // ... so the provider check should be identical to the intersection of
     // providers that provide those capabilities.
-    e2eskipper.SkipUnlessProviderIs("gce")
+    e2eskipper.SkipUnlessProviderIs("gce", "aws")
     e2eskipper.SkipUnlessSSHKeyPresent()
 
     err := e2erc.RunRC(ctx, testutils.RCConfig{
@@ -80,7 +81,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
 })
 
 func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
-    doEtcdFailure(ctx, failCommand, fixCommand)
+    doEtcdFailure(ctx, f, failCommand, fixCommand)
 
     checkExistingRCRecovers(ctx, f)
 
@@ -94,17 +95,30 @@ func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixC
 // master and go on to assert that etcd and kubernetes components recover.
 const etcdFailureDuration = 20 * time.Second
 
-func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
+func doEtcdFailure(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
     ginkgo.By("failing etcd")
 
-    masterExec(ctx, failCommand)
+    masterExec(ctx, f, failCommand)
     time.Sleep(etcdFailureDuration)
-    masterExec(ctx, fixCommand)
+    masterExec(ctx, f, fixCommand)
 }
 
-func masterExec(ctx context.Context, cmd string) {
-    host := framework.APIAddress() + ":22"
+func masterExec(ctx context.Context, f *framework.Framework, cmd string) {
+    nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+    // checks if there is at least one control-plane node
+    gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
+        "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
+    ips := framework.GetNodeExternalIPs(&nodes.Items[0])
+    gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+
+    host := ips[0] + ":22"
     result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
+    framework.ExpectNoError(err)
+    e2essh.LogResult(result)
+
+    result, err = e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
     framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
     if result.Code != 0 {
        e2essh.LogResult(result)
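The hunks above appear to be the etcd-failure disruptive test (package apps): doEtcdFailure and masterExec now take the test framework handle, the test also runs on aws, and the SSH target is resolved from the first control-plane node's external IP instead of framework.APIAddress(). The real masterExec additionally asserts via gomega that at least one control-plane node and one external IP exist before dialing port 22. Below is a minimal standalone sketch of that lookup, not code from the patch; the function name, error strings, and the fake node in main are illustrative, and it only assumes the real k8s.io/api types.

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
)

// firstControlPlaneSSHHost mirrors, in simplified form, what the patched
// masterExec now does: take the first control-plane node, pick its first
// ExternalIP address, and append the SSH port.
func firstControlPlaneSSHHost(nodes []corev1.Node) (string, error) {
    if len(nodes) == 0 {
        return "", fmt.Errorf("no control-plane nodes found")
    }
    for _, addr := range nodes[0].Status.Addresses {
        if addr.Type == corev1.NodeExternalIP && addr.Address != "" {
            return addr.Address + ":22", nil
        }
    }
    return "", fmt.Errorf("control-plane node %q has no external IP", nodes[0].Name)
}

func main() {
    // A fake node stands in for what the real test reads from the API server.
    node := corev1.Node{
        Status: corev1.NodeStatus{
            Addresses: []corev1.NodeAddress{
                {Type: corev1.NodeInternalIP, Address: "10.0.0.2"},
                {Type: corev1.NodeExternalIP, Address: "203.0.113.10"},
            },
        },
    }
    host, err := firstControlPlaneSSHHost([]corev1.Node{node})
    fmt.Println(host, err) // 203.0.113.10:22 <nil>
}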
@@ -19,6 +19,7 @@ package apps
 import (
     "context"
     "fmt"
+    "github.com/onsi/gomega"
     "strconv"
     "time"
 
@@ -40,7 +41,6 @@ import (
     e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
     e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
     e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-    testfwk "k8s.io/kubernetes/test/integration/framework"
     testutils "k8s.io/kubernetes/test/utils"
     imageutils "k8s.io/kubernetes/test/utils/image"
     admissionapi "k8s.io/pod-security-admission/api"
@@ -278,12 +278,18 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 
        // Requires master ssh access.
        e2eskipper.SkipUnlessProviderIs("gce", "aws")
-       nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-       framework.ExpectNoError(err)
+       nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+       // checks if there is at least one control-plane node
+       gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
        for i := range nodes.Items {
+           ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+           gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+
            restarter := NewRestartConfig(
-               getFirstIPforNode(&nodes.Items[i]), "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
+               ips[0], "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
            restarter.restart(ctx)
 
        // The intent is to ensure the replication controller manager has observed and reported status of
@@ -313,11 +319,17 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
     ginkgo.It("Scheduler should continue assigning pods to nodes across restart", func(ctx context.Context) {
        // Requires master ssh access.
        e2eskipper.SkipUnlessProviderIs("gce", "aws")
-       nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-       framework.ExpectNoError(err)
+       nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+       // checks if there is at least one control-plane node
+       gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
        for i := range nodes.Items {
+           ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+           gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+
            restarter := NewRestartConfig(
-               getFirstIPforNode(&nodes.Items[i]), "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
+               ips[0], "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
 
        // Create pods while the scheduler is down and make sure the scheduler picks them up by
        // scaling the rc to the same size.
@@ -367,42 +379,3 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
        }
     })
 })
-
-func getFirstIPforNode(node *v1.Node) string {
-    var ips []string
-    ips = append(ips, getAddresses(node, v1.NodeExternalIP)...)
-    if len(ips) == 0 {
-        // If ExternalIP isn't set, assume the test programs can reach the InternalIP
-        ips = append(ips, getAddresses(node, v1.NodeInternalIP)...)
-    }
-    if len(ips) == 0 {
-        framework.Failf("did not find any ip(s) for node: %v", node)
-    }
-    return ips[0]
-}
-
-func getAddresses(node *v1.Node, addressType v1.NodeAddressType) (ips []string) {
-    for j := range node.Status.Addresses {
-        nodeAddress := &node.Status.Addresses[j]
-        if nodeAddress.Type == addressType && nodeAddress.Address != "" {
-            ips = append(ips, nodeAddress.Address)
-        }
-    }
-    return
-}
-
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) (nodes *v1.NodeList, err error) {
-    nodes, err = c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-    if err != nil {
-        return nil, err
-    }
-    testfwk.Filter(nodes, func(node v1.Node) bool {
-        _, isMaster := node.Labels["node-role.kubernetes.io/master"]
-        _, isControlPlane := node.Labels["node-role.kubernetes.io/control-plane"]
-        return isMaster || isControlPlane
-    })
-    if len(nodes.Items) == 0 {
-        return nil, fmt.Errorf("there are currently no ready, schedulable control plane nodes in the cluster")
-    }
-    return nodes, nil
-}
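These hunks appear to be the DaemonRestart test in the same package. Its file-local helpers (getFirstIPforNode, getAddresses, getControlPlaneNodes) are deleted in favour of the shared framework helpers introduced later in this change, and the restart targets are now addressed by each node's first external IP. Below is a standalone sketch of the selection rule the shared helper applies, assuming only the label and taint key visible in the diff; note that, unlike the deleted local getControlPlaneNodes, it does not match the legacy node-role.kubernetes.io/master label. The sketch is illustrative and not part of the patch.

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
)

const controlPlaneLabel = "node-role.kubernetes.io/control-plane"

// isControlPlane reproduces the rule used by the new shared helper: a node
// counts as control plane if it carries the control-plane label, or if it
// carries the control-plane NoSchedule taint (the taint key equals the label key).
func isControlPlane(node *corev1.Node) bool {
    if _, ok := node.Labels[controlPlaneLabel]; ok {
        return true
    }
    for _, taint := range node.Spec.Taints {
        if taint.Key == controlPlaneLabel && taint.Effect == corev1.TaintEffectNoSchedule {
            return true
        }
    }
    return false
}

func main() {
    // A labelled node (typical kubeadm cluster) and a taint-only node both match.
    labelled := corev1.Node{}
    labelled.Labels = map[string]string{controlPlaneLabel: ""}

    tainted := corev1.Node{}
    tainted.Spec.Taints = []corev1.Taint{{Key: controlPlaneLabel, Effect: corev1.TaintEffectNoSchedule}}

    fmt.Println(isControlPlane(&labelled), isControlPlane(&tainted)) // true true
}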
@@ -128,6 +128,9 @@ const (
 
     // SnapshotDeleteTimeout is how long for snapshot to delete snapshotContent.
     SnapshotDeleteTimeout = 5 * time.Minute
+
+    // ControlPlaneLabel is valid label for kubeadm based clusters like kops ONLY
+    ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
 )
 
 var (
@@ -662,6 +665,17 @@ func RunCmdEnv(env []string, command string, args ...string) (string, string, er
     return stdout, stderr, nil
 }
 
+// GetNodeExternalIPs returns a list of external ip address(es) if any for a node
+func GetNodeExternalIPs(node *v1.Node) (ips []string) {
+    for j := range node.Status.Addresses {
+        nodeAddress := &node.Status.Addresses[j]
+        if nodeAddress.Type == v1.NodeExternalIP && nodeAddress.Address != "" {
+            ips = append(ips, nodeAddress.Address)
+        }
+    }
+    return
+}
+
 // getControlPlaneAddresses returns the externalIP, internalIP and hostname fields of control plane nodes.
 // If any of these is unavailable, empty slices are returned.
 func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]string, []string, []string) {
@@ -694,6 +708,33 @@ func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]str
     return externalIPs, internalIPs, hostnames
 }
 
+// GetControlPlaneNodes returns a list of control plane nodes
+func GetControlPlaneNodes(ctx context.Context, c clientset.Interface) *v1.NodeList {
+    allNodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+    ExpectNoError(err, "error reading all nodes")
+
+    var cpNodes v1.NodeList
+
+    for _, node := range allNodes.Items {
+        // Check for the control plane label
+        if _, hasLabel := node.Labels[ControlPlaneLabel]; hasLabel {
+            cpNodes.Items = append(cpNodes.Items, node)
+            continue
+        }
+
+        // Check for the specific taint
+        for _, taint := range node.Spec.Taints {
+            // NOTE the taint key is the same as the control plane label
+            if taint.Key == ControlPlaneLabel && taint.Effect == v1.TaintEffectNoSchedule {
+                cpNodes.Items = append(cpNodes.Items, node)
+                continue
+            }
+        }
+    }
+
+    return &cpNodes
+}
+
 // GetControlPlaneAddresses returns all IP addresses on which the kubelet can reach the control plane.
 // It may return internal and external IPs, even if we expect for
 // e.g. internal IPs to be used (issue #56787), so that we can be
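These hunks appear to be test/e2e framework utility code: the exported ControlPlaneLabel constant, GetNodeExternalIPs, and GetControlPlaneNodes are the helpers the updated tests above now call. A short usage sketch follows; it assumes it is compiled inside the kubernetes tree where k8s.io/kubernetes/test/e2e/framework is importable, and the wrapper function itself is illustrative, not part of the patch.

package e2eexample

import (
    "context"

    "k8s.io/client-go/kubernetes"
    "k8s.io/kubernetes/test/e2e/framework"
)

// controlPlaneSSHHosts shows how a test might combine the two new helpers:
// list control-plane nodes, then collect "<externalIP>:22" SSH endpoints,
// skipping nodes that report no external address.
func controlPlaneSSHHosts(ctx context.Context, c kubernetes.Interface) []string {
    var hosts []string
    nodes := framework.GetControlPlaneNodes(ctx, c)
    for i := range nodes.Items {
        if ips := framework.GetNodeExternalIPs(&nodes.Items[i]); len(ips) > 0 {
            hosts = append(hosts, ips[0]+":22")
        }
    }
    return hosts
}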
@@ -20,9 +20,6 @@ import (
     "context"
 
     corev1 "k8s.io/api/core/v1"
-    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/apimachinery/pkg/labels"
-    clientset "k8s.io/client-go/kubernetes"
     "k8s.io/kubernetes/test/e2e/framework"
     e2enode "k8s.io/kubernetes/test/e2e/framework/node"
     admissionapi "k8s.io/pod-security-admission/api"
@@ -31,10 +28,6 @@ import (
     "github.com/onsi/gomega"
 )
 
-const (
-    controlPlaneLabel = "node-role.kubernetes.io/control-plane"
-)
-
 // Define container for all the test specification aimed at verifying
 // that kubeadm configures the control-plane node as expected
 var _ = Describe("control-plane node", func() {
@@ -51,22 +44,14 @@ var _ = Describe("control-plane node", func() {
     // in case you can skip this test with SKIP=multi-node
     ginkgo.It("should be labelled and tainted [multi-node]", func(ctx context.Context) {
        // get all control-plane nodes (and this implicitly checks that node are properly labeled)
-       controlPlanes := getControlPlaneNodes(ctx, f.ClientSet)
+       controlPlanes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
 
        // checks if there is at least one control-plane node
-       gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", controlPlaneLabel)
+       gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", framework.ControlPlaneLabel)
 
        // checks that the control-plane nodes have the expected taints
        for _, cp := range controlPlanes.Items {
-           e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: controlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
+           e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: framework.ControlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
        }
     })
 })
-
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) *corev1.NodeList {
-    selector := labels.Set{controlPlaneLabel: ""}.AsSelector()
-    cpNodes, err := c.CoreV1().Nodes().
-        List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
-    framework.ExpectNoError(err, "error reading control-plane nodes")
-    return cpNodes
-}
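The final hunks appear to be the kubeadm control-plane node e2e test, which now reuses framework.GetControlPlaneNodes and framework.ControlPlaneLabel instead of its own label-selector query. For contrast, a self-contained sketch of the label-selector approach the deleted helper used is below; the kubeconfig loading and output are assumptions added only to make the example runnable outside the e2e framework.

package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

const controlPlaneLabel = "node-role.kubernetes.io/control-plane"

// This program reproduces what the deleted kubeadm-local getControlPlaneNodes
// did: a server-side label-selector query. The shared framework helper now
// used instead lists all nodes and also accepts taint-only control-plane nodes.
func main() {
    config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    client, err := kubernetes.NewForConfig(config)
    if err != nil {
        panic(err)
    }

    selector := labels.Set{controlPlaneLabel: ""}.AsSelector()
    nodes, err := client.CoreV1().Nodes().List(context.TODO(),
        metav1.ListOptions{LabelSelector: selector.String()})
    if err != nil {
        panic(err)
    }
    for _, n := range nodes.Items {
        fmt.Println(n.Name)
    }
}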