Refactor liveness probing
This commit builds on previous work and creates an independent worker for every liveness probe. Liveness probes behave largely the same as readiness probes, so much of the code is shared by introducing a probeType parameter to distinguish the type where it matters. The circular dependency between the runtime and the prober is broken by exposing a shared liveness ResultsManager, owned by the kubelet. Finally, an Updates channel is added to the ResultsManager so the kubelet can react to unhealthy containers immediately.
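The diff below only covers the test side of the change. As a rough illustration of the last point, here is a minimal sketch of how a kubelet-side consumer of the liveness ResultsManager's Updates channel could react to failures. The Result, Update, and Manager types and the syncPod callback are simplified stand-ins for this sketch, not the actual types in pkg/kubelet/prober/results.

// Sketch only: simplified stand-ins for the prober's results types; the real
// interface lives in pkg/kubelet/prober/results and differs in detail.
package sketch

// Result mirrors the boolean-style probe result used by the tests below.
type Result bool

const (
	Success Result = true
	Failure Result = false
)

// Update is assumed to identify the container whose cached result changed.
type Update struct {
	ContainerID string
	Result      Result
}

// Manager is a cut-down results manager: a cache of probe results plus a
// channel announcing changes to whoever owns the manager (the kubelet).
type Manager interface {
	Get(containerID string) (Result, bool)
	Set(containerID string, result Result)
	Updates() <-chan Update
}

// reactToLiveness shows the intent of the Updates channel: because the kubelet
// owns the liveness manager, it can watch for Failure results pushed by the
// probe workers and trigger a pod sync (and container restart) immediately,
// instead of waiting for the next periodic sync.
func reactToLiveness(liveness Manager, syncPod func(containerID string)) {
	for update := range liveness.Updates() {
		if update.Result == Failure {
			syncPod(update.ContainerID)
		}
	}
}

Because the probe workers only write into the manager and the kubelet only reads from it, the prober no longer needs a handle on the runtime, which is what breaks the circular dependency mentioned above.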
@@ -17,14 +17,19 @@ limitations under the License.
package prober

import (
	"fmt"
	"testing"
	"time"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/unversioned"
	"k8s.io/kubernetes/pkg/client/record"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/probe"
	"k8s.io/kubernetes/pkg/util"
	"k8s.io/kubernetes/pkg/util/exec"
	"k8s.io/kubernetes/pkg/util/wait"
)

const (
@@ -52,12 +57,11 @@ func TestDoProbe(t *testing.T) {
	failedStatus.Phase = api.PodFailed

	tests := []struct {
-		probe     api.Probe
-		podStatus *api.PodStatus
-
-		expectContinue    bool
-		expectReadySet    bool
-		expectedReadiness results.Result
+		probe          api.Probe
+		podStatus      *api.PodStatus
+		expectContinue bool
+		expectSet      bool
+		expectedResult results.Result
	}{
		{ // No status.
			expectContinue: true,
@@ -72,136 +76,158 @@ func TestDoProbe(t *testing.T) {
		{ // Container waiting
			podStatus:      &pendingStatus,
			expectContinue: true,
-			expectReadySet: true,
+			expectSet:      true,
		},
		{ // Container terminated
-			podStatus:      &terminatedStatus,
-			expectReadySet: true,
+			podStatus: &terminatedStatus,
+			expectSet: true,
		},
		{ // Probe successful.
-			podStatus:         &runningStatus,
-			expectContinue:    true,
-			expectReadySet:    true,
-			expectedReadiness: results.Success,
+			podStatus:      &runningStatus,
+			expectContinue: true,
+			expectSet:      true,
+			expectedResult: results.Success,
		},
		{ // Initial delay passed
			podStatus: &runningStatus,
			probe: api.Probe{
				InitialDelaySeconds: -100,
			},
-			expectContinue:    true,
-			expectReadySet:    true,
-			expectedReadiness: results.Success,
+			expectContinue: true,
+			expectSet:      true,
+			expectedResult: results.Success,
		},
	}

-	for i, test := range tests {
-		w := newTestWorker(test.probe)
-		if test.podStatus != nil {
-			m.statusManager.SetPodStatus(w.pod, *test.podStatus)
-		}
-		if c := doProbe(m, w); c != test.expectContinue {
-			t.Errorf("[%d] Expected continue to be %v but got %v", i, test.expectContinue, c)
-		}
-		ready, ok := m.readinessCache.Get(containerID)
-		if ok != test.expectReadySet {
-			t.Errorf("[%d] Expected to have readiness: %v but got %v", i, test.expectReadySet, ok)
-		}
-		if ready != test.expectedReadiness {
-			t.Errorf("[%d] Expected readiness: %v but got %v", i, test.expectedReadiness, ready)
-		}
+	for _, probeType := range [...]probeType{liveness, readiness} {
+		for i, test := range tests {
+			w := newTestWorker(m, probeType, test.probe)
+			if test.podStatus != nil {
+				m.statusManager.SetPodStatus(w.pod, *test.podStatus)
+			}
+			if c := w.doProbe(); c != test.expectContinue {
+				t.Errorf("[%s-%d] Expected continue to be %v but got %v", probeType, i, test.expectContinue, c)
+			}
+			result, ok := resultsManager(m, probeType).Get(containerID)
+			if ok != test.expectSet {
+				t.Errorf("[%s-%d] Expected to have result: %v but got %v", probeType, i, test.expectSet, ok)
+			}
+			if result != test.expectedResult {
+				t.Errorf("[%s-%d] Expected result: %v but got %v", probeType, i, test.expectedResult, result)
+			}

-		// Clean up.
-		m.statusManager.DeletePodStatus(podUID)
-		m.readinessCache.Remove(containerID)
+			// Clean up.
+			m.statusManager.DeletePodStatus(podUID)
+			resultsManager(m, probeType).Remove(containerID)
+		}
	}
}

func TestInitialDelay(t *testing.T) {
	m := newTestManager()
-	w := newTestWorker(api.Probe{
-		InitialDelaySeconds: 10,
-	})
-	m.statusManager.SetPodStatus(w.pod, getRunningStatus())

-	if !doProbe(m, w) {
-		t.Errorf("Expected to continue, but did not")
-	}
+	for _, probeType := range [...]probeType{liveness, readiness} {
+		w := newTestWorker(m, probeType, api.Probe{
+			InitialDelaySeconds: 10,
+		})
+		m.statusManager.SetPodStatus(w.pod, getRunningStatus())

-	ready, ok := m.readinessCache.Get(containerID)
-	if !ok {
-		t.Errorf("Expected readiness to be false, but was not set")
-	} else if ready {
-		t.Errorf("Expected readiness to be false, but was true")
-	}
+		if !w.doProbe() {
+			t.Errorf("[%s] Expected to continue, but did not", probeType)
+		}

-	// 100 seconds later...
-	laterStatus := getRunningStatus()
-	laterStatus.ContainerStatuses[0].State.Running.StartedAt.Time =
-		time.Now().Add(-100 * time.Second)
-	m.statusManager.SetPodStatus(w.pod, laterStatus)
+		expectedResult := results.Result(probeType == liveness)
+		result, ok := resultsManager(m, probeType).Get(containerID)
+		if !ok {
+			t.Errorf("[%s] Expected result to be set during initial delay, but was not set", probeType)
+		} else if result != expectedResult {
+			t.Errorf("[%s] Expected result to be %v during initial delay, but was %v",
+				probeType, expectedResult, result)
+		}

-	// Second call should succeed (already waited).
-	if !doProbe(m, w) {
-		t.Errorf("Expected to continue, but did not")
-	}
+		// 100 seconds later...
+		laterStatus := getRunningStatus()
+		laterStatus.ContainerStatuses[0].State.Running.StartedAt.Time =
+			time.Now().Add(-100 * time.Second)
+		m.statusManager.SetPodStatus(w.pod, laterStatus)

-	ready, ok = m.readinessCache.Get(containerID)
-	if !ok {
-		t.Errorf("Expected readiness to be true, but was not set")
-	} else if !ready {
-		t.Errorf("Expected readiness to be true, but was false")
+		// Second call should succeed (already waited).
+		if !w.doProbe() {
+			t.Errorf("[%s] Expected to continue, but did not", probeType)
+		}

+		result, ok = resultsManager(m, probeType).Get(containerID)
+		if !ok {
+			t.Errorf("[%s] Expected result to be true, but was not set", probeType)
+		} else if !result {
+			t.Errorf("[%s] Expected result to be true, but was false", probeType)
+		}
	}
}

func TestCleanUp(t *testing.T) {
	m := newTestManager()
-	pod := getTestPod(api.Probe{})
-	m.statusManager.SetPodStatus(&pod, getRunningStatus())
-	m.readinessCache.Set(containerID, results.Success)
-	w := m.newWorker(&pod, pod.Spec.Containers[0])
-	m.readinessProbes[containerPath{podUID, containerName}] = w

-	if ready, _ := m.readinessCache.Get(containerID); !ready {
-		t.Fatal("Expected readiness to be true.")
-	}
+	for _, probeType := range [...]probeType{liveness, readiness} {
+		key := probeKey{podUID, containerName, probeType}
+		w := newTestWorker(m, probeType, api.Probe{})
+		m.statusManager.SetPodStatus(w.pod, getRunningStatus())
+		go w.run()
+		m.workers[key] = w

-	close(w.stop)
-	if err := waitForWorkerExit(m, []containerPath{{podUID, containerName}}); err != nil {
-		t.Fatal(err)
-	}
+		// Wait for worker to run.
+		condition := func() (bool, error) {
+			ready, _ := resultsManager(m, probeType).Get(containerID)
+			return ready == results.Success, nil
+		}
+		if ready, _ := condition(); !ready {
+			if err := wait.Poll(100*time.Millisecond, util.ForeverTestTimeout, condition); err != nil {
+				t.Fatalf("[%s] Error waiting for worker ready: %v", probeType, err)
+			}
+		}

-	if _, ok := m.readinessCache.Get(containerID); ok {
-		t.Error("Expected readiness to be cleared.")
-	}
-	if _, ok := m.readinessProbes[containerPath{podUID, containerName}]; ok {
-		t.Error("Expected worker to be cleared.")
+		close(w.stop)
+		if err := waitForWorkerExit(m, []probeKey{key}); err != nil {
+			t.Fatalf("[%s] error waiting for worker exit: %v", probeType, err)
		}

+		if _, ok := resultsManager(m, probeType).Get(containerID); ok {
+			t.Errorf("[%s] Expected result to be cleared.", probeType)
+		}
+		if _, ok := m.workers[key]; ok {
+			t.Errorf("[%s] Expected worker to be cleared.", probeType)
+		}
	}
}

func TestHandleCrash(t *testing.T) {
	m := newTestManager()
-	m.prober = CrashingProber{}
-	w := newTestWorker(api.Probe{})
+	m.prober = &prober{
+		refManager: kubecontainer.NewRefManager(),
+		recorder:   &record.FakeRecorder{},
+		exec:       crashingExecProber{},
+	}
+
+	w := newTestWorker(m, readiness, api.Probe{})
	m.statusManager.SetPodStatus(w.pod, getRunningStatus())

	// doProbe should recover from the crash, and keep going.
-	if !doProbe(m, w) {
+	if !w.doProbe() {
		t.Error("Expected to keep going, but terminated.")
	}
-	if _, ok := m.readinessCache.Get(containerID); ok {
+	if _, ok := m.readinessManager.Get(containerID); ok {
		t.Error("Expected readiness to be unchanged from crash.")
	}
}

-func newTestWorker(probeSpec api.Probe) *worker {
-	pod := getTestPod(probeSpec)
-	return &worker{
-		stop:      make(chan struct{}),
-		pod:       &pod,
-		container: pod.Spec.Containers[0],
-		spec:      &probeSpec,
+func newTestWorker(m *manager, probeType probeType, probeSpec api.Probe) *worker {
+	// All tests rely on the fake exec prober.
+	probeSpec.Handler = api.Handler{
+		Exec: &api.ExecAction{},
+	}
+
+	pod := getTestPod(probeType, probeSpec)
+	return newWorker(m, probeType, &pod, pod.Spec.Containers[0])
}

func getRunningStatus() api.PodStatus {
@@ -217,10 +243,15 @@ func getRunningStatus() api.PodStatus {
	return podStatus
}

-func getTestPod(probeSpec api.Probe) api.Pod {
+func getTestPod(probeType probeType, probeSpec api.Probe) api.Pod {
	container := api.Container{
-		Name:           containerName,
-		ReadinessProbe: &probeSpec,
+		Name: containerName,
	}
+	switch probeType {
+	case readiness:
+		container.ReadinessProbe = &probeSpec
+	case liveness:
+		container.LivenessProbe = &probeSpec
+	}
	pod := api.Pod{
		Spec: api.PodSpec{
@@ -232,12 +263,18 @@ func getTestPod(probeSpec api.Probe) api.Pod {
	return pod
}

-type CrashingProber struct{}
-
-func (f CrashingProber) ProbeLiveness(_ *api.Pod, _ api.PodStatus, c api.Container, _ kubecontainer.ContainerID, _ int64) (probe.Result, error) {
-	panic("Intentional ProbeLiveness crash.")
+func resultsManager(m *manager, probeType probeType) results.Manager {
+	switch probeType {
+	case readiness:
+		return m.readinessManager
+	case liveness:
+		return m.livenessManager
+	}
+	panic(fmt.Errorf("Unhandled case: %v", probeType))
}

-func (f CrashingProber) ProbeReadiness(_ *api.Pod, _ api.PodStatus, c api.Container, _ kubecontainer.ContainerID) (probe.Result, error) {
-	panic("Intentional ProbeReadiness crash.")
+type crashingExecProber struct{}
+
+func (p crashingExecProber) Probe(_ exec.Cmd) (probe.Result, string, error) {
+	panic("Intentional Probe crash.")
}