Merge pull request #61478 from shyamjvs/capture-pod-startup-phases-as-metrics
Automatic merge from submit-queue (batch tested with PRs 61378, 60915, 61499, 61507, 61478). If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md

Capture pod startup phases as metrics

Learning from https://github.com/kubernetes/kubernetes/issues/60589, we should also start collecting and graphing sub-parts of pod-startup latency.

/sig scalability
/kind feature
/priority important-soon
/cc @wojtek-t

```release-note
NONE
```
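For context, the breakdown added here is just the set of deltas between a pod's lifecycle timestamps (created, scheduled, running, observed via watch). A minimal, self-contained sketch of that idea, with illustrative types and names only (not the e2e framework's actual code):

```go
// Sketch only: podTimestamps and phaseLatencies are hypothetical stand-ins
// used to illustrate the phase breakdown this PR starts reporting.
package main

import (
	"fmt"
	"time"
)

// podTimestamps holds the lifecycle timestamps of a single pod.
type podTimestamps struct {
	created   time.Time // pod object created in the API server
	scheduled time.Time // pod bound to a node by the scheduler
	running   time.Time // containers reported running by the kubelet
	watched   time.Time // running status observed by the test's watch
}

// phaseLatencies computes the per-phase durations corresponding to the
// metrics introduced in this PR.
func phaseLatencies(ts podTimestamps) map[string]time.Duration {
	return map[string]time.Duration{
		"create_to_schedule": ts.scheduled.Sub(ts.created),
		"schedule_to_run":    ts.running.Sub(ts.scheduled),
		"run_to_watch":       ts.watched.Sub(ts.running),
		"schedule_to_watch":  ts.watched.Sub(ts.scheduled),
		"pod_startup":        ts.watched.Sub(ts.created), // end-to-end
	}
}

func main() {
	base := time.Now()
	ts := podTimestamps{
		created:   base,
		scheduled: base.Add(300 * time.Millisecond),
		running:   base.Add(1200 * time.Millisecond),
		watched:   base.Add(1400 * time.Millisecond),
	}
	for phase, d := range phaseLatencies(ts) {
		fmt.Printf("%-20s %v\n", phase, d)
	}
}
```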
@@ -43,7 +43,6 @@ const (
 	// NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
 	NodeStartupThreshold = 4 * time.Second
 
-	podStartupThreshold time.Duration = 5 * time.Second
 	// We are setting 1s threshold for apicalls even in small clusters to avoid flakes.
 	// The problem is that if long GC is happening in small clusters (where we have e.g.
 	// 1-core master machines) and tests are pretty short, it may consume significant

@@ -188,7 +187,11 @@ type LatencyMetric struct {
 }
 
 type PodStartupLatency struct {
-	Latency LatencyMetric `json:"latency"`
+	CreateToScheduleLatency LatencyMetric `json:"createToScheduleLatency"`
+	ScheduleToRunLatency    LatencyMetric `json:"scheduleToRunLatency"`
+	RunToWatchLatency       LatencyMetric `json:"runToWatchLatency"`
+	ScheduleToWatchLatency  LatencyMetric `json:"scheduleToWatchLatency"`
+	E2ELatency              LatencyMetric `json:"e2eLatency"`
 }
 
 func (l *PodStartupLatency) SummaryKind() string {

@@ -398,17 +401,17 @@ func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIRespons
 	return badMetrics, metrics, nil
 }
 
-// Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
-// within the threshold.
-func VerifyPodStartupLatency(latency *PodStartupLatency) error {
-	if latency.Latency.Perc50 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50)
+// Verifies whether 50, 90 and 99th percentiles of a latency metric are
+// within the expected threshold.
+func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error {
+	if actual.Perc50 > threshold.Perc50 {
+		return fmt.Errorf("too high %v latency 50th percentile: %v", metricName, actual.Perc50)
 	}
-	if latency.Latency.Perc90 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.Latency.Perc90)
+	if actual.Perc90 > threshold.Perc90 {
+		return fmt.Errorf("too high %v latency 90th percentile: %v", metricName, actual.Perc90)
 	}
-	if latency.Latency.Perc99 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.Latency.Perc99)
+	if actual.Perc99 > threshold.Perc99 {
+		return fmt.Errorf("too high %v latency 99th percentile: %v", metricName, actual.Perc99)
 	}
 	return nil
 }

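The check above is now generic over the metric name and a per-percentile threshold, so it can be reused for any LatencyMetric. A self-contained sketch of how a caller is expected to use it; LatencyMetric and the verifier are re-declared locally here (only the fields needed) so the example compiles on its own:

```go
package main

import (
	"fmt"
	"time"
)

// Local stand-in for the framework's LatencyMetric (only the percentiles
// the verifier looks at).
type LatencyMetric struct {
	Perc50, Perc90, Perc99 time.Duration
}

// Local copy of the generic verifier added in the hunk above.
func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error {
	if actual.Perc50 > threshold.Perc50 {
		return fmt.Errorf("too high %v latency 50th percentile: %v", metricName, actual.Perc50)
	}
	if actual.Perc90 > threshold.Perc90 {
		return fmt.Errorf("too high %v latency 90th percentile: %v", metricName, actual.Perc90)
	}
	if actual.Perc99 > threshold.Perc99 {
		return fmt.Errorf("too high %v latency 99th percentile: %v", metricName, actual.Perc99)
	}
	return nil
}

func main() {
	// Same bound on every percentile, as the density test below does.
	threshold := LatencyMetric{Perc50: 5 * time.Second, Perc90: 5 * time.Second, Perc99: 5 * time.Second}
	actual := LatencyMetric{Perc50: 800 * time.Millisecond, Perc90: 2 * time.Second, Perc99: 6 * time.Second}
	if err := VerifyLatencyWithinThreshold(threshold, actual, "pod startup"); err != nil {
		fmt.Println(err) // fails on the 99th percentile
	}
}
```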
@@ -53,22 +53,29 @@ func ApiCallToPerfData(apicalls *APIResponsiveness) *perftype.PerfData {
 	return perfData
 }
 
-// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData.
-func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData {
-	perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion}
-	item := perftype.DataItem{
+func latencyToPerfData(l LatencyMetric, name string) perftype.DataItem {
+	return perftype.DataItem{
 		Data: map[string]float64{
-			"Perc50":  float64(latency.Latency.Perc50) / 1000000, // us -> ms
-			"Perc90":  float64(latency.Latency.Perc90) / 1000000,
-			"Perc99":  float64(latency.Latency.Perc99) / 1000000,
-			"Perc100": float64(latency.Latency.Perc100) / 1000000,
+			"Perc50":  float64(l.Perc50) / 1000000, // us -> ms
+			"Perc90":  float64(l.Perc90) / 1000000,
+			"Perc99":  float64(l.Perc99) / 1000000,
+			"Perc100": float64(l.Perc100) / 1000000,
 		},
 		Unit: "ms",
 		Labels: map[string]string{
-			"Metric": "pod_startup",
+			"Metric": name,
 		},
 	}
-	perfData.DataItems = append(perfData.DataItems, item)
+}
+
+// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData.
+func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData {
+	perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion}
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.CreateToScheduleLatency, "create_to_schedule"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToRunLatency, "schedule_to_run"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.RunToWatchLatency, "run_to_watch"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToWatchLatency, "schedule_to_watch"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.E2ELatency, "pod_startup"))
 	return perfData
 }

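For reference, a sketch of the perf-data shape the refactored helper emits, using local mirror types in place of perftype.PerfData and perftype.DataItem (field shapes inferred from the hunk above; the version string and JSON tags here are placeholders, not the real perftype definitions):

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Local mirrors of the perftype shapes used in the hunk above.
type DataItem struct {
	Data   map[string]float64 `json:"data"`
	Unit   string             `json:"unit"`
	Labels map[string]string  `json:"labels"`
}

type PerfData struct {
	Version   string     `json:"version"`
	DataItems []DataItem `json:"dataItems"`
}

type LatencyMetric struct {
	Perc50, Perc90, Perc99, Perc100 time.Duration
}

// latencyToPerfData converts one latency metric into a labeled data item.
// time.Duration is in nanoseconds, so dividing by 1e6 yields milliseconds.
func latencyToPerfData(l LatencyMetric, name string) DataItem {
	return DataItem{
		Data: map[string]float64{
			"Perc50":  float64(l.Perc50) / 1000000,
			"Perc90":  float64(l.Perc90) / 1000000,
			"Perc99":  float64(l.Perc99) / 1000000,
			"Perc100": float64(l.Perc100) / 1000000,
		},
		Unit:   "ms",
		Labels: map[string]string{"Metric": name},
	}
}

func main() {
	pd := PerfData{Version: "v1"} // placeholder version string
	pd.DataItems = append(pd.DataItems, latencyToPerfData(LatencyMetric{
		Perc50:  300 * time.Millisecond,
		Perc90:  600 * time.Millisecond,
		Perc99:  time.Second,
		Perc100: 2 * time.Second,
	}, "create_to_schedule"))
	out, _ := json.MarshalIndent(pd, "", "  ")
	fmt.Println(string(out))
}
```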
@@ -50,6 +50,7 @@ import (
 )
 
 const (
+	PodStartupLatencyThreshold = 5 * time.Second
 	MinSaturationThreshold     = 2 * time.Minute
 	MinPodsPerSecondThroughput = 8
 	DensityPollInterval        = 10 * time.Second

@@ -829,16 +830,29 @@ var _ = SIGDescribe("Density", func() {
 				sort.Sort(framework.LatencySlice(schedToWatchLag))
 				sort.Sort(framework.LatencySlice(e2eLag))
 
-				framework.PrintLatencies(scheduleLag, "worst schedule latencies")
-				framework.PrintLatencies(startupLag, "worst run-after-schedule latencies")
-				framework.PrintLatencies(watchLag, "worst watch latencies")
-				framework.PrintLatencies(schedToWatchLag, "worst scheduled-to-end total latencies")
-				framework.PrintLatencies(e2eLag, "worst e2e total latencies")
+				framework.PrintLatencies(scheduleLag, "worst create-to-schedule latencies")
+				framework.PrintLatencies(startupLag, "worst schedule-to-run latencies")
+				framework.PrintLatencies(watchLag, "worst run-to-watch latencies")
+				framework.PrintLatencies(schedToWatchLag, "worst schedule-to-watch latencies")
+				framework.PrintLatencies(e2eLag, "worst e2e latencies")
+
+				// Capture latency metrics related to pod-startup.
+				podStartupLatency := &framework.PodStartupLatency{
+					CreateToScheduleLatency: framework.ExtractLatencyMetrics(scheduleLag),
+					ScheduleToRunLatency:    framework.ExtractLatencyMetrics(startupLag),
+					RunToWatchLatency:       framework.ExtractLatencyMetrics(watchLag),
+					ScheduleToWatchLatency:  framework.ExtractLatencyMetrics(schedToWatchLag),
+					E2ELatency:              framework.ExtractLatencyMetrics(e2eLag),
+				}
+				f.TestSummaries = append(f.TestSummaries, podStartupLatency)
 
 				// Test whether e2e pod startup time is acceptable.
-				podStartupLatency := &framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)}
-				f.TestSummaries = append(f.TestSummaries, podStartupLatency)
-				framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency))
+				podStartupLatencyThreshold := framework.LatencyMetric{
+					Perc50: PodStartupLatencyThreshold,
+					Perc90: PodStartupLatencyThreshold,
+					Perc99: PodStartupLatencyThreshold,
+				}
+				framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLatencyThreshold, podStartupLatency.E2ELatency, "pod startup"))
 
 				framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c)
 				latencyMeasurementPhase.End()

@@ -472,21 +472,6 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) {
 	return latencyMetrics, nil
 }
 
-// verifyPodStartupLatency verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
-// within the threshold.
-func verifyPodStartupLatency(expect, actual framework.LatencyMetric) error {
-	if actual.Perc50 > expect.Perc50 {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", actual.Perc50)
-	}
-	if actual.Perc90 > expect.Perc90 {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", actual.Perc90)
-	}
-	if actual.Perc99 > expect.Perc99 {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", actual.Perc99)
-	}
-	return nil
-}
-
 // newInformerWatchPod creates an informer to check whether all pods are running.
 func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes map[string]metav1.Time, podType string) cache.Controller {
 	ns := f.Namespace.Name

@@ -556,14 +541,14 @@ func logAndVerifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyD
 	latencyMetrics, _ := getPodStartLatency(kubeletAddr)
 	framework.Logf("Kubelet Prometheus metrics (not reset):\n%s", framework.PrettyPrintJSON(latencyMetrics))
 
-	podCreateLatency := framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLags)}
+	podStartupLatency := framework.ExtractLatencyMetrics(e2eLags)
 
 	// log latency perf data
-	logPerfData(getLatencyPerfData(podCreateLatency.Latency, testInfo), "latency")
+	logPerfData(getLatencyPerfData(podStartupLatency, testInfo), "latency")
 
 	if isVerify {
 		// check whether e2e pod startup time is acceptable.
-		framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podCreateLatency.Latency))
+		framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLimits, podStartupLatency, "pod startup"))
 
 		// check bactch pod creation latency
 		if podBatchStartupLimit > 0 {