	Fix a flaky e2e test of Job completion
The test was flaky because it required the job to succeed 3 times, with a pseudorandom 50% failure chance per pod, within 15 minutes, while the Job controller applies an exponential back-off delay (10s, 20s, 40s …), capped at 6 minutes, before recreating failed pods. Seven consecutive failures (a 1/128 chance) could therefore take 20+ minutes, so the test intermittently failed with "timed out waiting for the condition". This PR instead forces the Job's Pods to be scheduled onto a single node and uses a hostPath volume in place of an emptyDir, so the marker data written by a failed Pod persists across its replacements.
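For reference, a back-of-the-envelope Go sketch of the timing argument above. The 10s back-off doubling up to the 6-minute cap is taken from the description; the 30-second per-attempt pod overhead is an illustrative assumption, not a measured value.

package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	const (
		failures   = 7                // consecutive failures needed to overrun the 15-minute limit
		backoffCap = 6 * time.Minute  // cap on the recreate delay, per the description above
		perAttempt = 30 * time.Second // assumed scheduling + run time per pod (illustrative only)
	)

	delay := 10 * time.Second // initial back-off before a failed pod is recreated
	var elapsed time.Duration
	for i := 0; i < failures; i++ {
		elapsed += perAttempt + delay // the pod runs and fails, then the controller waits
		delay *= 2
		if delay > backoffCap {
			delay = backoffCap
		}
	}

	// 0.5^7 = 1/128; the back-off delays alone (10s+20s+...+360s) sum to 16.5 minutes.
	fmt.Printf("chance of %d straight failures: %.4f\n", failures, math.Pow(0.5, failures))
	fmt.Printf("worst-case time before the next pod can even start: %v\n", elapsed)
}

With these numbers the worst case comes to roughly 20 minutes, which is why the 15-minute wait occasionally expired.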
@@ -26,6 +26,7 @@ import (
 	batchinternal "k8s.io/kubernetes/pkg/apis/batch"
 	"k8s.io/kubernetes/test/e2e/framework"
 	jobutil "k8s.io/kubernetes/test/e2e/framework/job"
+	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 
 	"github.com/onsi/ginkgo"

@@ -94,12 +95,10 @@ var _ = SIGDescribe("Job", func() {
 	framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func() {
 		ginkgo.By("Creating a job")
 		// One failure, then a success, local restarts.
-		// We can't use the random failure approach used by the
-		// non-local test below, because kubelet will throttle
-		// frequently failing containers in a given pod, ramping
-		// up to 5 minutes between restarts, making test timeouts
-		// due to successive failures too likely with a reasonable
-		// test timeout.
+		// We can't use the random failure approach, because kubelet will
+		// throttle frequently failing containers in a given pod, ramping
+		// up to 5 minutes between restarts, making test timeout due to
+		// successive failures too likely with a reasonable test timeout.
 		job := jobutil.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
 		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
 		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

@@ -111,18 +110,20 @@ var _ = SIGDescribe("Job", func() {
 
 	// Pods sometimes fail, but eventually succeed, after pod restarts
 	ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func() {
+		// One failure, then a success, no local restarts.
+		// We can't use the random failure approach, because JobController
+		// will throttle frequently failing Pods of a given Job, ramping
+		// up to 6 minutes between restarts, making test timeout due to
+		// successive failures.
+		// Instead, we force the Job's Pods to be scheduled to a single Node
+		// and use a hostPath volume to persist data across new Pods.
+		ginkgo.By("Looking for a node to schedule job pod")
+		node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
+		framework.ExpectNoError(err)
+
 		ginkgo.By("Creating a job")
-		// 50% chance of container success, local restarts.
-		// Can't use the failOnce approach because that relies
-		// on an emptyDir, which is not preserved across new pods.
-		// Worst case analysis: 15 failures, each taking 1 minute to
-		// run due to some slowness, 1 in 2^15 chance of happening,
-		// causing test flake.  Should be very rare.
-		// With the introduction of backoff limit and high failure rate this
-		// is hitting its timeout, the 3 is a reasonable that should make this
-		// test less flaky, for now.
-		job := jobutil.NewTestJob("randomlySucceedOrFail", "rand-non-local", v1.RestartPolicyNever, parallelism, 3, nil, 999)
-		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
+		job := jobutil.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job, err = jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
 		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
 
 		ginkgo.By("Ensuring job reaches completions")

@@ -17,6 +17,7 @@ go_library(
         "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//test/e2e/framework:go_default_library",

@@ -20,6 +20,7 @@ import (
 	batchv1 "k8s.io/api/batch/v1"
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/rand"
 	"k8s.io/kubernetes/test/e2e/framework"
 )

@@ -30,6 +31,13 @@ import (
 // policy of the containers in which the Pod is running. Parallelism is the Job's parallelism, and completions is the
 // Job's required number of completions.
 func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32) *batchv1.Job {
+	anyNode := ""
+	return NewTestJobOnNode(behavior, name, rPol, parallelism, completions, activeDeadlineSeconds, backoffLimit, anyNode)
+}
+
+// NewTestJobOnNode is similar to NewTestJob but supports specifying a Node on which the Job's Pods will run.
+// Empty nodeName means no node selection constraints.
+func NewTestJobOnNode(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32, nodeName string) *batchv1.Job {
 	manualSelector := false
 	job := &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{

@@ -72,6 +80,7 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
 							SecurityContext: &v1.SecurityContext{},
 						},
 					},
+					NodeName: nodeName,
 				},
 			},
 		},

@@ -89,10 +98,21 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
 		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exit $(( $RANDOM / 16384 ))"}
 	case "failOnce":
 		// Fail the first the container of the pod is run, and
-		// succeed the second time. Checks for file on emptydir.
+		// succeed the second time. Checks for file on a data volume.
 		// If present, succeed.  If not, create but fail.
-		// Note that this cannot be used with RestartNever because
-		// it always fails the first time for a pod.
+		// If RestartPolicy is Never, the nodeName should be set to
+		// ensure all job pods run on a single node and the volume
+		// will be mounted from a hostPath instead.
+		if len(nodeName) > 0 {
+			randomDir := "/tmp/job-e2e/" + rand.String(10)
+			hostPathType := v1.HostPathDirectoryOrCreate
+			job.Spec.Template.Spec.Volumes[0].VolumeSource = v1.VolumeSource{HostPath: &v1.HostPathVolumeSource{Path: randomDir, Type: &hostPathType}}
+			// Tests involving r/w operations on hostPath volume needs to run in
+			// privileged mode for SELinux enabled distro, while Windows platform
+			// neither supports nor needs privileged mode.
+			privileged := !framework.NodeOSDistroIs("windows")
+			job.Spec.Template.Spec.Containers[0].SecurityContext.Privileged = &privileged
+		}
		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi"}
 	}
 	return job
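As an aside, a minimal standalone Go sketch (not part of the change) of the "failOnce" contract that the shell one-liner above implements: succeed if a marker file is already present, otherwise create it and fail. With RestartPolicy=Never every retry is a brand-new Pod, so the marker only survives if the volume outlives the Pod and all Pods land on the same node, which is what the hostPath volume plus the NodeName pin provide. The temp directory below merely stands in for the hostPath mount and is not taken from the test.

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// failOnce mimics the container command: if the marker file exists, succeed
// (exit 0); otherwise create it and fail (exit 1).
func failOnce(dataDir string) int {
	marker := filepath.Join(dataDir, "foo")
	if _, err := os.Stat(marker); err == nil {
		return 0 // a previous attempt already left the marker behind
	}
	_ = os.WriteFile(marker, nil, 0o644) // leave the marker for the next attempt
	return 1                             // ...but this attempt still fails
}

func main() {
	// Stands in for the hostPath directory mounted at /data in every pod of the Job.
	dir, err := os.MkdirTemp("", "job-e2e")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	fmt.Println("attempt 1 exits:", failOnce(dir)) // 1: fails, but the marker persists
	fmt.Println("attempt 2 exits:", failOnce(dir)) // 0: sees the marker, succeeds
}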