Refactor liveness probing
This commit builds on previous work and creates an independent worker for every liveness probe. Liveness probes behave largely the same as readiness probes, so much of the code is shared by introducing a probeType parameter to distinguish the type where it matters. The circular dependency between the runtime and the prober is broken by exposing a shared liveness ResultsManager, owned by the kubelet. Finally, an Updates channel is added to the ResultsManager so the kubelet can react to unhealthy containers immediately.
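The diff below only covers the test side of the change. As a rough illustration of the last point, here is a minimal sketch of how a kubelet-side consumer of the liveness ResultsManager's Updates channel could react to failures. The Result, Update, and Manager types and the syncPod callback are simplified stand-ins for this sketch, not the actual types in pkg/kubelet/prober/results.

// Sketch only: simplified stand-ins for the prober's results types; the real
// interface lives in pkg/kubelet/prober/results and differs in detail.
package sketch

// Result mirrors the boolean-style probe result used by the tests below.
type Result bool

const (
	Success Result = true
	Failure Result = false
)

// Update is assumed to identify the container whose cached result changed.
type Update struct {
	ContainerID string
	Result      Result
}

// Manager is a cut-down results manager: a cache of probe results plus a
// channel announcing changes to whoever owns the manager (the kubelet).
type Manager interface {
	Get(containerID string) (Result, bool)
	Set(containerID string, result Result)
	Updates() <-chan Update
}

// reactToLiveness shows the intent of the Updates channel: because the kubelet
// owns the liveness manager, it can watch for Failure results pushed by the
// probe workers and trigger a pod sync (and container restart) immediately,
// instead of waiting for the next periodic sync.
func reactToLiveness(liveness Manager, syncPod func(containerID string)) {
	for update := range liveness.Updates() {
		if update.Result == Failure {
			syncPod(update.ContainerID)
		}
	}
}

Because the probe workers only write into the manager and the kubelet only reads from it, the prober no longer needs a handle on the runtime, which is what breaks the circular dependency mentioned above.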
@@ -17,14 +17,19 @@ limitations under the License.
package prober

import (
	"fmt"
	"testing"
	"time"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/unversioned"
	"k8s.io/kubernetes/pkg/client/record"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/probe"
	"k8s.io/kubernetes/pkg/util"
	"k8s.io/kubernetes/pkg/util/exec"
	"k8s.io/kubernetes/pkg/util/wait"
)

const (
@@ -52,12 +57,11 @@ func TestDoProbe(t *testing.T) {
	failedStatus.Phase = api.PodFailed

	tests := []struct {
-		probe     api.Probe
-		podStatus *api.PodStatus
-
-		expectContinue    bool
-		expectReadySet    bool
-		expectedReadiness results.Result
+		probe          api.Probe
+		podStatus      *api.PodStatus
+		expectContinue bool
+		expectSet      bool
+		expectedResult results.Result
	}{
		{ // No status.
			expectContinue: true,
@@ -72,136 +76,158 @@ func TestDoProbe(t *testing.T) {
		{ // Container waiting
			podStatus:      &pendingStatus,
			expectContinue: true,
-			expectReadySet: true,
+			expectSet:      true,
		},
		{ // Container terminated
-			podStatus:      &terminatedStatus,
-			expectReadySet: true,
+			podStatus: &terminatedStatus,
+			expectSet: true,
		},
		{ // Probe successful.
-			podStatus:         &runningStatus,
-			expectContinue:    true,
-			expectReadySet:    true,
-			expectedReadiness: results.Success,
+			podStatus:      &runningStatus,
+			expectContinue: true,
+			expectSet:      true,
+			expectedResult: results.Success,
		},
		{ // Initial delay passed
			podStatus: &runningStatus,
			probe: api.Probe{
				InitialDelaySeconds: -100,
			},
-			expectContinue:    true,
-			expectReadySet:    true,
-			expectedReadiness: results.Success,
+			expectContinue: true,
+			expectSet:      true,
+			expectedResult: results.Success,
		},
	}

-	for i, test := range tests {
-		w := newTestWorker(test.probe)
-		if test.podStatus != nil {
-			m.statusManager.SetPodStatus(w.pod, *test.podStatus)
-		}
-		if c := doProbe(m, w); c != test.expectContinue {
-			t.Errorf("[%d] Expected continue to be %v but got %v", i, test.expectContinue, c)
-		}
-		ready, ok := m.readinessCache.Get(containerID)
-		if ok != test.expectReadySet {
-			t.Errorf("[%d] Expected to have readiness: %v but got %v", i, test.expectReadySet, ok)
-		}
-		if ready != test.expectedReadiness {
-			t.Errorf("[%d] Expected readiness: %v but got %v", i, test.expectedReadiness, ready)
-		}
+	for _, probeType := range [...]probeType{liveness, readiness} {
+		for i, test := range tests {
+			w := newTestWorker(m, probeType, test.probe)
+			if test.podStatus != nil {
+				m.statusManager.SetPodStatus(w.pod, *test.podStatus)
+			}
+			if c := w.doProbe(); c != test.expectContinue {
+				t.Errorf("[%s-%d] Expected continue to be %v but got %v", probeType, i, test.expectContinue, c)
+			}
+			result, ok := resultsManager(m, probeType).Get(containerID)
+			if ok != test.expectSet {
+				t.Errorf("[%s-%d] Expected to have result: %v but got %v", probeType, i, test.expectSet, ok)
+			}
+			if result != test.expectedResult {
+				t.Errorf("[%s-%d] Expected result: %v but got %v", probeType, i, test.expectedResult, result)
+			}

-		// Clean up.
-		m.statusManager.DeletePodStatus(podUID)
-		m.readinessCache.Remove(containerID)
+			// Clean up.
+			m.statusManager.DeletePodStatus(podUID)
+			resultsManager(m, probeType).Remove(containerID)
+		}
	}
}

func TestInitialDelay(t *testing.T) {
	m := newTestManager()
-	w := newTestWorker(api.Probe{
-		InitialDelaySeconds: 10,
-	})
-	m.statusManager.SetPodStatus(w.pod, getRunningStatus())

-	if !doProbe(m, w) {
-		t.Errorf("Expected to continue, but did not")
-	}
+	for _, probeType := range [...]probeType{liveness, readiness} {
+		w := newTestWorker(m, probeType, api.Probe{
+			InitialDelaySeconds: 10,
+		})
+		m.statusManager.SetPodStatus(w.pod, getRunningStatus())

-	ready, ok := m.readinessCache.Get(containerID)
-	if !ok {
-		t.Errorf("Expected readiness to be false, but was not set")
-	} else if ready {
-		t.Errorf("Expected readiness to be false, but was true")
-	}
+		if !w.doProbe() {
+			t.Errorf("[%s] Expected to continue, but did not", probeType)
+		}

-	// 100 seconds later...
-	laterStatus := getRunningStatus()
-	laterStatus.ContainerStatuses[0].State.Running.StartedAt.Time =
-		time.Now().Add(-100 * time.Second)
-	m.statusManager.SetPodStatus(w.pod, laterStatus)
+		expectedResult := results.Result(probeType == liveness)
+		result, ok := resultsManager(m, probeType).Get(containerID)
+		if !ok {
+			t.Errorf("[%s] Expected result to be set during initial delay, but was not set", probeType)
+		} else if result != expectedResult {
+			t.Errorf("[%s] Expected result to be %v during initial delay, but was %v",
+				probeType, expectedResult, result)
+		}

-	// Second call should succeed (already waited).
-	if !doProbe(m, w) {
-		t.Errorf("Expected to continue, but did not")
-	}
+		// 100 seconds later...
+		laterStatus := getRunningStatus()
+		laterStatus.ContainerStatuses[0].State.Running.StartedAt.Time =
+			time.Now().Add(-100 * time.Second)
+		m.statusManager.SetPodStatus(w.pod, laterStatus)

-	ready, ok = m.readinessCache.Get(containerID)
-	if !ok {
-		t.Errorf("Expected readiness to be true, but was not set")
-	} else if !ready {
-		t.Errorf("Expected readiness to be true, but was false")
+		// Second call should succeed (already waited).
+		if !w.doProbe() {
+			t.Errorf("[%s] Expected to continue, but did not", probeType)
+		}

+		result, ok = resultsManager(m, probeType).Get(containerID)
+		if !ok {
+			t.Errorf("[%s] Expected result to be true, but was not set", probeType)
+		} else if !result {
+			t.Errorf("[%s] Expected result to be true, but was false", probeType)
+		}
	}
}

func TestCleanUp(t *testing.T) {
	m := newTestManager()
-	pod := getTestPod(api.Probe{})
-	m.statusManager.SetPodStatus(&pod, getRunningStatus())
-	m.readinessCache.Set(containerID, results.Success)
-	w := m.newWorker(&pod, pod.Spec.Containers[0])
-	m.readinessProbes[containerPath{podUID, containerName}] = w

-	if ready, _ := m.readinessCache.Get(containerID); !ready {
-		t.Fatal("Expected readiness to be true.")
-	}
+	for _, probeType := range [...]probeType{liveness, readiness} {
+		key := probeKey{podUID, containerName, probeType}
+		w := newTestWorker(m, probeType, api.Probe{})
+		m.statusManager.SetPodStatus(w.pod, getRunningStatus())
+		go w.run()
+		m.workers[key] = w

-	close(w.stop)
-	if err := waitForWorkerExit(m, []containerPath{{podUID, containerName}}); err != nil {
-		t.Fatal(err)
-	}
+		// Wait for worker to run.
+		condition := func() (bool, error) {
+			ready, _ := resultsManager(m, probeType).Get(containerID)
+			return ready == results.Success, nil
+		}
+		if ready, _ := condition(); !ready {
+			if err := wait.Poll(100*time.Millisecond, util.ForeverTestTimeout, condition); err != nil {
+				t.Fatalf("[%s] Error waiting for worker ready: %v", probeType, err)
+			}
+		}

-	if _, ok := m.readinessCache.Get(containerID); ok {
-		t.Error("Expected readiness to be cleared.")
-	}
-	if _, ok := m.readinessProbes[containerPath{podUID, containerName}]; ok {
-		t.Error("Expected worker to be cleared.")
+		close(w.stop)
+		if err := waitForWorkerExit(m, []probeKey{key}); err != nil {
+			t.Fatalf("[%s] error waiting for worker exit: %v", probeType, err)
		}

+		if _, ok := resultsManager(m, probeType).Get(containerID); ok {
+			t.Errorf("[%s] Expected result to be cleared.", probeType)
+		}
+		if _, ok := m.workers[key]; ok {
+			t.Errorf("[%s] Expected worker to be cleared.", probeType)
+		}
	}
}

func TestHandleCrash(t *testing.T) {
	m := newTestManager()
-	m.prober = CrashingProber{}
-	w := newTestWorker(api.Probe{})
+	m.prober = &prober{
+		refManager: kubecontainer.NewRefManager(),
+		recorder:   &record.FakeRecorder{},
+		exec:       crashingExecProber{},
+	}
+
+	w := newTestWorker(m, readiness, api.Probe{})
	m.statusManager.SetPodStatus(w.pod, getRunningStatus())

	// doProbe should recover from the crash, and keep going.
-	if !doProbe(m, w) {
+	if !w.doProbe() {
		t.Error("Expected to keep going, but terminated.")
	}
-	if _, ok := m.readinessCache.Get(containerID); ok {
+	if _, ok := m.readinessManager.Get(containerID); ok {
		t.Error("Expected readiness to be unchanged from crash.")
	}
}

-func newTestWorker(probeSpec api.Probe) *worker {
-	pod := getTestPod(probeSpec)
-	return &worker{
-		stop:      make(chan struct{}),
-		pod:       &pod,
-		container: pod.Spec.Containers[0],
-		spec:      &probeSpec,
+func newTestWorker(m *manager, probeType probeType, probeSpec api.Probe) *worker {
+	// All tests rely on the fake exec prober.
+	probeSpec.Handler = api.Handler{
+		Exec: &api.ExecAction{},
+	}
+
+	pod := getTestPod(probeType, probeSpec)
+	return newWorker(m, probeType, &pod, pod.Spec.Containers[0])
}

func getRunningStatus() api.PodStatus {
@@ -217,10 +243,15 @@ func getRunningStatus() api.PodStatus {
	return podStatus
}

-func getTestPod(probeSpec api.Probe) api.Pod {
+func getTestPod(probeType probeType, probeSpec api.Probe) api.Pod {
	container := api.Container{
-		Name:           containerName,
-		ReadinessProbe: &probeSpec,
+		Name: containerName,
	}
+	switch probeType {
+	case readiness:
+		container.ReadinessProbe = &probeSpec
+	case liveness:
+		container.LivenessProbe = &probeSpec
+	}
	pod := api.Pod{
		Spec: api.PodSpec{
@@ -232,12 +263,18 @@ func getTestPod(probeSpec api.Probe) api.Pod {
	return pod
}

-type CrashingProber struct{}
-
-func (f CrashingProber) ProbeLiveness(_ *api.Pod, _ api.PodStatus, c api.Container, _ kubecontainer.ContainerID, _ int64) (probe.Result, error) {
-	panic("Intentional ProbeLiveness crash.")
+func resultsManager(m *manager, probeType probeType) results.Manager {
+	switch probeType {
+	case readiness:
+		return m.readinessManager
+	case liveness:
+		return m.livenessManager
+	}
+	panic(fmt.Errorf("Unhandled case: %v", probeType))
}

-func (f CrashingProber) ProbeReadiness(_ *api.Pod, _ api.PodStatus, c api.Container, _ kubecontainer.ContainerID) (probe.Result, error) {
-	panic("Intentional ProbeReadiness crash.")
+type crashingExecProber struct{}
+
+func (p crashingExecProber) Probe(_ exec.Cmd) (probe.Result, string, error) {
+	panic("Intentional Probe crash.")
}